summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig6
-rw-r--r--fs/aio.c13
-rw-r--r--fs/btrfs/async-thread.c44
-rw-r--r--fs/btrfs/async-thread.h28
-rw-r--r--fs/btrfs/btrfs_inode.h13
-rw-r--r--fs/btrfs/delayed-inode.c4
-rw-r--r--fs/btrfs/disk-io.c56
-rw-r--r--fs/btrfs/extent-tree.c23
-rw-r--r--fs/btrfs/extent_io.c5
-rw-r--r--fs/btrfs/file.c19
-rw-r--r--fs/btrfs/inode.c300
-rw-r--r--fs/btrfs/ioctl.c68
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/qgroup.c3
-rw-r--r--fs/btrfs/raid56.c9
-rw-r--r--fs/btrfs/reada.c3
-rw-r--r--fs/btrfs/scrub.c25
-rw-r--r--fs/btrfs/sysfs.c2
-rw-r--r--fs/btrfs/tree-log.c83
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c64
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/cachefiles/bind.c8
-rw-r--r--fs/cachefiles/daemon.c30
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/main.c2
-rw-r--r--fs/cachefiles/namei.c17
-rw-r--r--fs/cachefiles/rdwr.c6
-rw-r--r--fs/cachefiles/xattr.c10
-rw-r--r--fs/cifs/Kconfig35
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h5
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/file.c16
-rw-r--r--fs/cifs/inode.c5
-rw-r--r--fs/cifs/link.c12
-rw-r--r--fs/cifs/netmisc.c20
-rw-r--r--fs/cifs/readdir.c4
-rw-r--r--fs/cifs/sess.c24
-rw-r--r--fs/cifs/smb1ops.c2
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2inode.c2
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2ops.c4
-rw-r--r--fs/cifs/smb2pdu.c7
-rw-r--r--fs/dcache.c111
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/eventpoll.c3
-rw-r--r--fs/ext4/ext4.h18
-rw-r--r--fs/ext4/extents.c88
-rw-r--r--fs/ext4/inode.c44
-rw-r--r--fs/ext4/mballoc.c5
-rw-r--r--fs/ext4/namei.c58
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c5
-rw-r--r--fs/f2fs/Kconfig4
-rw-r--r--fs/f2fs/checkpoint.c175
-rw-r--r--fs/f2fs/data.c86
-rw-r--r--fs/f2fs/debug.c24
-rw-r--r--fs/f2fs/dir.c25
-rw-r--r--fs/f2fs/f2fs.h187
-rw-r--r--fs/f2fs/file.c315
-rw-r--r--fs/f2fs/gc.c34
-rw-r--r--fs/f2fs/gc.h2
-rw-r--r--fs/f2fs/hash.c7
-rw-r--r--fs/f2fs/inline.c58
-rw-r--r--fs/f2fs/inode.c37
-rw-r--r--fs/f2fs/namei.c66
-rw-r--r--fs/f2fs/node.c536
-rw-r--r--fs/f2fs/node.h60
-rw-r--r--fs/f2fs/recovery.c219
-rw-r--r--fs/f2fs/segment.c573
-rw-r--r--fs/f2fs/segment.h162
-rw-r--r--fs/f2fs/super.c77
-rw-r--r--fs/f2fs/xattr.c10
-rw-r--r--fs/fscache/object.c1
-rw-r--r--fs/fscache/page.c25
-rw-r--r--fs/fuse/file.c1
-rw-r--r--fs/gfs2/bmap.c9
-rw-r--r--fs/gfs2/file.c15
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c9
-rw-r--r--fs/gfs2/super.c20
-rw-r--r--fs/jbd2/commit.c21
-rw-r--r--fs/jbd2/journal.c56
-rw-r--r--fs/jbd2/recovery.c33
-rw-r--r--fs/jbd2/revoke.c6
-rw-r--r--fs/lockd/Makefile3
-rw-r--r--fs/lockd/mon.c6
-rw-r--r--fs/lockd/netns.h1
-rw-r--r--fs/lockd/procfs.c92
-rw-r--r--fs/lockd/procfs.h28
-rw-r--r--fs/lockd/svc.c20
-rw-r--r--fs/locks.c2
-rw-r--r--fs/namei.c96
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/nfs/blocklayout/Makefile3
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1386
-rw-r--r--fs/nfs/blocklayout/blocklayout.h213
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c384
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c108
-rw-r--r--fs/nfs/blocklayout/dev.c363
-rw-r--r--fs/nfs/blocklayout/extent_tree.c602
-rw-r--r--fs/nfs/blocklayout/extents.c908
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c285
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_proc.c23
-rw-r--r--fs/nfs/client.c18
-rw-r--r--fs/nfs/direct.c14
-rw-r--r--fs/nfs/file.c52
-rw-r--r--fs/nfs/filelayout/filelayout.c39
-rw-r--r--fs/nfs/filelayout/filelayout.h7
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c108
-rw-r--r--fs/nfs/fscache-index.c3
-rw-r--r--fs/nfs/inode.c4
-rw-r--r--fs/nfs/internal.h7
-rw-r--r--fs/nfs/nfs3_fs.h34
-rw-r--r--fs/nfs/nfs3acl.c6
-rw-r--r--fs/nfs/nfs3client.c1
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3super.c1
-rw-r--r--fs/nfs/nfs4_fs.h13
-rw-r--r--fs/nfs/nfs4client.c38
-rw-r--r--fs/nfs/nfs4proc.c188
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c42
-rw-r--r--fs/nfs/nfs4xdr.c179
-rw-r--r--fs/nfs/objlayout/objio_osd.c113
-rw-r--r--fs/nfs/objlayout/objlayout.c70
-rw-r--r--fs/nfs/objlayout/objlayout.h5
-rw-r--r--fs/nfs/pagelist.c8
-rw-r--r--fs/nfs/pnfs.c105
-rw-r--r--fs/nfs/pnfs.h50
-rw-r--r--fs/nfs/pnfs_dev.c150
-rw-r--r--fs/nfs/super.c11
-rw-r--r--fs/nfs/write.c150
-rw-r--r--fs/nfs_common/Makefile3
-rw-r--r--fs/nfs_common/grace.c (renamed from fs/lockd/grace.c)68
-rw-r--r--fs/nfsd/Kconfig4
-rw-r--r--fs/nfsd/cache.h1
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/nfs3proc.c13
-rw-r--r--fs/nfsd/nfs4callback.c144
-rw-r--r--fs/nfsd/nfs4idmap.c20
-rw-r--r--fs/nfsd/nfs4proc.c49
-rw-r--r--fs/nfsd/nfs4recover.c205
-rw-r--r--fs/nfsd/nfs4state.c115
-rw-r--r--fs/nfsd/nfs4xdr.c92
-rw-r--r--fs/nfsd/nfscache.c214
-rw-r--r--fs/nfsd/nfsctl.c45
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.c6
-rw-r--r--fs/nfsd/state.h31
-rw-r--r--fs/nfsd/vfs.c37
-rw-r--r--fs/nfsd/xdr4.h14
-rw-r--r--fs/nilfs2/inode.c7
-rw-r--r--fs/notify/fdinfo.c4
-rw-r--r--fs/ocfs2/cluster/quorum.c13
-rw-r--r--fs/ocfs2/cluster/tcp.c45
-rw-r--r--fs/ocfs2/cluster/tcp.h1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c22
-rw-r--r--fs/ocfs2/ioctl.c129
-rw-r--r--fs/ocfs2/super.c1
-rw-r--r--fs/pnode.c1
-rw-r--r--fs/proc/task_mmu.c27
-rw-r--r--fs/stack.c2
-rw-r--r--fs/sync.c2
-rw-r--r--fs/udf/ialloc.c28
-rw-r--r--fs/udf/inode.c161
-rw-r--r--fs/udf/namei.c157
-rw-r--r--fs/udf/super.c69
-rw-r--r--fs/udf/udfdecl.h3
-rw-r--r--fs/ufs/ialloc.c6
-rw-r--r--fs/ufs/inode.c7
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c18
-rw-r--r--fs/xfs/xfs_aops.c61
-rw-r--r--fs/xfs/xfs_bmap_util.c20
-rw-r--r--fs/xfs/xfs_file.c27
180 files changed, 6470 insertions, 5589 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
source "fs/nfs/Kconfig"
source "fs/nfsd/Kconfig"
+config GRACE_PERIOD
+ tristate
+
config LOCKD
tristate
depends on FILE_LOCKING
+ select GRACE_PERIOD
config LOCKD_V4
bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
config NFS_COMMON
bool
- depends on NFSD || NFS_FS
+ depends on NFSD || NFS_FS || LOCKD
default y
source "net/sunrpc/Kconfig"
diff --git a/fs/aio.c b/fs/aio.c
index 97bc62cbe2da..733750096b71 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -793,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
for (i = 0; i < table->nr; ++i) {
struct kioctx *ctx = table->table[i];
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
if (!ctx)
continue;
@@ -804,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
* that it needs to unmap the area, just set it to 0.
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx, NULL);
+ kill_ioctx(mm, ctx, &requests_done);
+
+ /* Wait until all IO for the context is done. */
+ wait_for_completion(&requests_done);
}
RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -1111,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
tail = ring->tail;
kunmap_atomic(ring);
+ /*
+ * Ensure that once we've read the current tail pointer, we
+ * also see the events that were stored up to the tail.
+ */
+ smp_rmb();
+
pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
if (head == tail)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..fbd76ded9a34 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
-#include <linux/workqueue.h>
#include "async-thread.h"
#include "ctree.h"
@@ -55,8 +54,39 @@ struct btrfs_workqueue {
struct __btrfs_workqueue *high;
};
-static inline struct __btrfs_workqueue
-*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name) \
+void btrfs_##name(struct work_struct *arg) \
+{ \
+ struct btrfs_work *work = container_of(arg, struct btrfs_work, \
+ normal_work); \
+ normal_work_helper(work); \
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
spin_unlock_irqrestore(lock, flags);
}
-static void normal_work_helper(struct work_struct *arg)
+static void normal_work_helper(struct btrfs_work *work)
{
- struct btrfs_work *work;
struct __btrfs_workqueue *wq;
int need_order = 0;
- work = container_of(arg, struct btrfs_work, normal_work);
/*
* We should not touch things inside work in the following cases:
* 1) after work->func() if it has no ordered_free
@@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg)
trace_btrfs_all_work_done(work);
}
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
btrfs_func_t func,
btrfs_func_t ordered_func,
btrfs_func_t ordered_free)
@@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work,
work->func = func;
work->ordered_func = ordered_func;
work->ordered_free = ordered_free;
- INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_WORK(&work->normal_work, uniq_func);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e9e31c94758f 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
struct btrfs_workqueue;
/* Internal use only */
struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
@@ -38,11 +40,35 @@ struct btrfs_work {
unsigned long flags;
};
+#define BTRFS_WORK_HELPER_PROTO(name) \
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
int flags,
int max_active,
int thresh);
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
btrfs_func_t func,
btrfs_func_t ordered_func,
btrfs_func_t ordered_free);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd78825..56b8522d5767 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -234,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
BTRFS_I(inode)->last_sub_trans <=
BTRFS_I(inode)->last_log_commit &&
BTRFS_I(inode)->last_sub_trans <=
- BTRFS_I(inode)->root->last_log_commit)
- return 1;
+ BTRFS_I(inode)->root->last_log_commit) {
+ /*
+ * After a ranged fsync we might have left some extent maps
+ * (that fall outside the fsync's range). So return false
+ * here if the list isn't empty, to make sure btrfs_log_inode()
+ * will be called and process those extent maps.
+ */
+ smp_mb();
+ if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+ return 1;
+ }
return 0;
}
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..a2e90f855d7d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
- NULL, NULL);
+ btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
+ btrfs_async_run_delayed_root, NULL, NULL);
async_work->nr = nr;
btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d0ed9e664f7d..a1d36e62179c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
-#include "async-thread.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
@@ -693,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
fs_info = end_io_wq->info;
end_io_wq->error = err;
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- btrfs_queue_work(fs_info->endio_meta_write_workers,
- &end_io_wq->work);
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- btrfs_queue_work(fs_info->endio_freespace_worker,
- &end_io_wq->work);
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_work(fs_info->endio_raid56_workers,
- &end_io_wq->work);
- else
- btrfs_queue_work(fs_info->endio_write_workers,
- &end_io_wq->work);
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+ wq = fs_info->endio_meta_write_workers;
+ func = btrfs_endio_meta_write_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+ wq = fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ wq = fs_info->endio_raid56_workers;
+ func = btrfs_endio_raid56_helper;
+ } else {
+ wq = fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
} else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_work(fs_info->endio_raid56_workers,
- &end_io_wq->work);
- else if (end_io_wq->metadata)
- btrfs_queue_work(fs_info->endio_meta_workers,
- &end_io_wq->work);
- else
- btrfs_queue_work(fs_info->endio_workers,
- &end_io_wq->work);
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ wq = fs_info->endio_raid56_workers;
+ func = btrfs_endio_raid56_helper;
+ } else if (end_io_wq->metadata) {
+ wq = fs_info->endio_meta_workers;
+ func = btrfs_endio_meta_helper;
+ } else {
+ wq = fs_info->endio_workers;
+ func = btrfs_endio_helper;
+ }
}
+
+ btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+ btrfs_queue_work(wq, &end_io_wq->work);
}
/*
@@ -828,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- btrfs_init_work(&async->work, run_one_async_start,
+ btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
run_one_async_done, run_one_async_free);
async->bio_flags = bio_flags;
@@ -3450,7 +3455,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
btrfs_set_stack_device_generation(dev_item, 0);
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
- btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+ btrfs_set_stack_device_total_bytes(dev_item,
+ dev->disk_total_bytes);
btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 102ed3143976..3efe1c3877bf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+ caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
async->sync = 0;
init_completion(&async->wait);
- btrfs_init_work(&async->work, delayed_ref_async_start,
- NULL, NULL);
+ btrfs_init_work(&async->work, btrfs_extent_refs_helper,
+ delayed_ref_async_start, NULL, NULL);
btrfs_queue_work(root->fs_info->extent_workers, &async->work);
@@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
*/
static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
- /*
- * we add in the count of missing devices because we want
- * to make sure that any RAID levels on a degraded FS
- * continue to be honored.
- */
- u64 num_devices = root->fs_info->fs_devices->rw_devices +
- root->fs_info->fs_devices->missing_devices;
+ u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
u64 tmp;
@@ -8440,13 +8435,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (stripped)
return extended_to_chunk(stripped);
- /*
- * we add in the count of missing devices because we want
- * to make sure that any RAID levels on a degraded FS
- * continue to be honored.
- */
- num_devices = root->fs_info->fs_devices->rw_devices +
- root->fs_info->fs_devices->missing_devices;
+ num_devices = root->fs_info->fs_devices->rw_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e11aab9f391..af0359dcf337 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2532,6 +2532,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
uptodate = 0;
+ offset += len;
continue;
}
}
@@ -4207,8 +4208,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
- start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
- len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+ start = round_down(start, BTRFS_I(inode)->root->sectorsize);
+ len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
/*
* lookup the last file extent. We're not using i_size here
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d3afac292d67..ff1cc0399b9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1840,7 +1840,15 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
- filemap_flush(inode->i_mapping);
+ /*
+ * ordered_data_close is set by setattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
return 0;
}
@@ -1958,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_init_log_ctx(&ctx);
- ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+ ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2088,10 +2096,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
goto out;
}
- if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+ if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
u64 num_bytes;
- path->slots[0]++;
key.offset = offset;
btrfs_set_item_key_safe(root, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2216,7 +2223,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+ lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
lockend = round_down(offset + len,
BTRFS_I(inode)->root->sectorsize) - 1;
same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2277,7 +2284,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
tail_start + tail_len, 0, 1);
if (ret)
goto out_only_mutex;
- }
+ }
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 03708ef3deef..016c403bfe7e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -778,8 +778,12 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- if (ret)
+ if (ret) {
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
goto out_free_reserve;
+ }
/*
* clear dirty, set writeback and unlock the pages.
@@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode,
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
if (ret)
- goto out_reserve;
+ goto out_drop_extent_cache;
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
if (ret)
- goto out_reserve;
+ goto out_drop_extent_cache;
}
if (disk_num_bytes < cur_alloc_size)
@@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode,
out:
return ret;
+out_drop_extent_cache:
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
@@ -1096,8 +1102,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- btrfs_init_work(&async_cow->work, async_cow_start,
- async_cow_submit, async_cow_free);
+ btrfs_init_work(&async_cow->work,
+ btrfs_delalloc_helper,
+ async_cow_start, async_cow_submit,
+ async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
@@ -1881,7 +1889,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+ btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
return -EBUSY;
@@ -2822,7 +2831,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2831,13 +2841,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
end - start + 1, uptodate))
return 0;
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+ if (btrfs_is_free_space_inode(inode)) {
+ wq = root->fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else {
+ wq = root->fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
- if (btrfs_is_free_space_inode(inode))
- workers = root->fs_info->endio_freespace_worker;
- else
- workers = root->fs_info->endio_write_workers;
- btrfs_queue_work(workers, &ordered_extent->work);
+ btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+ NULL);
+ btrfs_queue_work(wq, &ordered_extent->work);
return 0;
}
@@ -4234,7 +4248,8 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
error:
- if (last_size != (u64)-1)
+ if (last_size != (u64)-1 &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
return err;
@@ -4674,6 +4689,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
remove_extent_mapping(map_tree, em);
free_extent_map(em);
+ if (need_resched()) {
+ write_unlock(&map_tree->lock);
+ cond_resched();
+ write_lock(&map_tree->lock);
+ }
}
write_unlock(&map_tree->lock);
@@ -4696,6 +4716,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
&cached_state, GFP_NOFS);
free_extent_state(state);
+ cond_resched();
spin_lock(&io_tree->lock);
}
spin_unlock(&io_tree->lock);
@@ -5181,6 +5202,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
iput(inode);
inode = ERR_PTR(ret);
}
+ /*
+ * If orphan cleanup did remove any orphans, it means the tree
+ * was modified and therefore the commit root is not the same as
+ * the current root anymore. This is a problem, because send
+ * uses the commit root and therefore can see inode items that
+ * don't exist in the current root anymore, and for example make
+ * calls to btrfs_iget, which will do tree lookups based on the
+ * current root and not on the commit root. Those lookups will
+ * fail, returning a -ESTALE error, and making send fail with
+ * that error. So make sure a send does not see any orphans we
+ * have just removed, and that it will see the same inodes
+ * regardless of whether a transaction commit happened before
+ * it started (meaning that the commit root will be the same as
+ * the current root) or not.
+ */
+ if (sub_root->node != sub_root->commit_root) {
+ u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
+
+ if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
+ struct extent_buffer *eb;
+
+ /*
+ * Assert we can't have races between dentry
+ * lookup called through the snapshot creation
+ * ioctl and the VFS.
+ */
+ ASSERT(mutex_is_locked(&dir->i_mutex));
+
+ down_write(&root->fs_info->commit_root_sem);
+ eb = sub_root->commit_root;
+ sub_root->commit_root =
+ btrfs_root_node(sub_root);
+ up_write(&root->fs_info->commit_root_sem);
+ free_extent_buffer(eb);
+ }
+ }
}
return inode;
@@ -5577,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
return ret;
}
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+ struct btrfs_iget_args args;
+ args.location = &BTRFS_I(inode)->location;
+ args.root = BTRFS_I(inode)->root;
+
+ return insert_inode_locked4(inode,
+ btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+ btrfs_find_actor, &args);
+}
+
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir,
@@ -5606,6 +5674,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
}
/*
+ * O_TMPFILE, set link count to 0, so that after this point,
+ * we fill in an inode item with the correct link count.
+ */
+ if (!name)
+ set_nlink(inode, 0);
+
+ /*
* we have to initialize this early, so we can reclaim the inode
* number if we fail afterwards in this function.
*/
@@ -5662,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[1] = name_len + sizeof(*ref);
}
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0)
+ goto fail;
+
path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
- goto fail;
+ goto fail_unlock;
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
@@ -5688,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-
btrfs_inherit_iflags(inode, dir);
if (S_ISREG(mode)) {
@@ -5703,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BTRFS_INODE_NODATASUM;
}
- btrfs_insert_inode_hash(inode);
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
@@ -5718,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_ino(inode), root->root_key.objectid, ret);
return inode;
+
+fail_unlock:
+ unlock_new_inode(inode);
fail:
if (dir && name)
BTRFS_I(dir)->index_cnt--;
@@ -5852,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
- goto out_unlock;
- }
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
* if the filesystem supports xattrs by looking at the
* ops vector.
*/
-
inode->i_op = &btrfs_special_inode_operations;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ init_special_inode(inode, inode->i_mode, rdev);
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
- drop_inode = 1;
- else {
- init_special_inode(inode, inode->i_mode, rdev);
+ goto out_unlock_inode;
+
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ if (err) {
+ goto out_unlock_inode;
+ } else {
btrfs_update_inode(trans, root, inode);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
}
+
out_unlock:
btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
@@ -5883,6 +5964,12 @@ out_unlock:
iput(inode);
}
return err;
+
+out_unlock_inode:
+ drop_inode = 1;
+ unlock_new_inode(inode);
+ goto out_unlock;
+
}
static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5917,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
drop_inode_on_err = 1;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_update_inode(trans, root, inode);
- if (err)
- goto out_unlock;
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -5934,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
*/
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
+
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_unlock_inode;
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- goto out_unlock;
+ goto out_unlock_inode;
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
out_unlock:
@@ -5953,6 +6040,11 @@ out_unlock:
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
+
+out_unlock_inode:
+ unlock_new_inode(inode);
+ goto out_unlock;
+
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6060,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
drop_on_err = 1;
+ /* these must be set before we unlock the inode */
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
- goto out_fail;
-
- inode->i_op = &btrfs_dir_inode_operations;
- inode->i_fop = &btrfs_dir_file_operations;
+ goto out_fail_inode;
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, root, inode);
if (err)
- goto out_fail;
+ goto out_fail_inode;
err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
dentry->d_name.len, 0, index);
if (err)
- goto out_fail;
+ goto out_fail_inode;
d_instantiate(dentry, inode);
+ /*
+ * mkdir is special. We're unlocking after we call d_instantiate
+ * to avoid a race with nfsd calling d_instantiate.
+ */
+ unlock_new_inode(inode);
drop_on_err = 0;
out_fail:
@@ -6088,6 +6185,10 @@ out_fail:
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
+
+out_fail_inode:
+ unlock_new_inode(inode);
+ goto out_fail;
}
/* helper for btfs_get_extent. Given an existing extent in the tree,
@@ -6097,14 +6198,14 @@ out_fail:
static int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
- u64 map_start, u64 map_len)
+ u64 map_start)
{
u64 start_diff;
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
start_diff = map_start - em->start;
em->start = map_start;
- em->len = map_len;
+ em->len = existing->start - em->start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
em->block_start += start_diff;
@@ -6275,6 +6376,8 @@ next:
goto not_found;
if (start + len <= found_key.offset)
goto not_found;
+ if (start > found_key.offset)
+ goto next;
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
@@ -6390,8 +6493,7 @@ insert:
em->len);
if (existing) {
err = merge_extent_mapping(em_tree, existing,
- em, start,
- root->sectorsize);
+ em, start);
free_extent_map(existing);
if (err) {
free_extent_map(em);
@@ -7158,7 +7260,8 @@ again:
if (!ret)
goto out_test;
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+ finish_ordered_fn, NULL, NULL);
btrfs_queue_work(root->fs_info->endio_write_workers,
&ordered->work);
out_test:
@@ -7306,10 +7409,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
&map_length, NULL, 0);
- if (ret) {
- bio_put(orig_bio);
+ if (ret)
return -EIO;
- }
if (map_length >= orig_bio->bi_iter.bi_size) {
bio = orig_bio;
@@ -7326,6 +7427,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
if (!bio)
return -ENOMEM;
+
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
atomic_inc(&dip->pending_bios);
@@ -7534,7 +7636,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
count = iov_iter_count(iter);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset, count);
+ filemap_fdatawrite_range(inode->i_mapping, offset,
+ offset + count - 1);
if (rw & WRITE) {
/*
@@ -8041,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
+ unlock_new_inode(inode);
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
if (err)
@@ -8495,7 +8599,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
work->inode = inode;
work->wait = wait;
work->delay_iput = delay_iput;
- btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+ WARN_ON_ONCE(!inode);
+ btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+ btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -8699,12 +8805,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
- goto out_unlock;
- }
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -8713,23 +8813,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
*/
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- drop_inode = 1;
- else {
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- }
- if (drop_inode)
- goto out_unlock;
+ goto out_unlock_inode;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- drop_inode = 1;
- goto out_unlock;
+ goto out_unlock_inode;
}
key.objectid = btrfs_ino(inode);
key.offset = 0;
@@ -8738,9 +8837,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
- drop_inode = 1;
btrfs_free_path(path);
- goto out_unlock;
+ goto out_unlock_inode;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8764,12 +8862,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
- if (err)
+ if (err) {
drop_inode = 1;
+ goto out_unlock_inode;
+ }
+
+ unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
out_unlock:
- if (!err)
- d_instantiate(dentry, inode);
btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -8777,6 +8878,11 @@ out_unlock:
}
btrfs_btree_balance_dirty(root);
return err;
+
+out_unlock_inode:
+ drop_inode = 1;
+ unlock_new_inode(inode);
+ goto out_unlock;
}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -8960,14 +9066,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out;
}
- ret = btrfs_init_inode_security(trans, inode, dir, NULL);
- if (ret)
- goto out;
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- goto out;
-
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
@@ -8975,10 +9073,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out_inode;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out_inode;
ret = btrfs_orphan_add(trans, inode);
if (ret)
- goto out;
+ goto out_inode;
+ /*
+ * We set number of links to 0 in btrfs_new_inode(), and here we set
+ * it to 1 because d_tmpfile() will issue a warning if the count is 0,
+ * through:
+ *
+ * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+ */
+ set_nlink(inode, 1);
+ unlock_new_inode(inode);
d_tmpfile(dentry, inode);
mark_inode_dirty(inode);
@@ -8988,8 +9102,12 @@ out:
iput(inode);
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
-
return ret;
+
+out_inode:
+ unlock_new_inode(inode);
+ goto out;
+
}
static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..8a8e29878c34 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
- ret = btrfs_orphan_cleanup(pending_snapshot->snap);
- if (ret)
- goto fail;
-
- /*
- * If orphan cleanup did remove any orphans, it means the tree was
- * modified and therefore the commit root is not the same as the
- * current root anymore. This is a problem, because send uses the
- * commit root and therefore can see inode items that don't exist
- * in the current root anymore, and for example make calls to
- * btrfs_iget, which will do tree lookups based on the current root
- * and not on the commit root. Those lookups will fail, returning a
- * -ESTALE error, and making send fail with that error. So make sure
- * a send does not see any orphans we have just removed, and that it
- * will see the same inodes regardless of whether a transaction
- * commit happened before it started (meaning that the commit root
- * will be the same as the current root) or not.
- */
- if (readonly && pending_snapshot->snap->node !=
- pending_snapshot->snap->commit_root) {
- trans = btrfs_join_transaction(pending_snapshot->snap);
- if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
- ret = PTR_ERR(trans);
- goto fail;
- }
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans,
- pending_snapshot->snap);
- if (ret)
- goto fail;
- }
- }
-
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -1052,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
- (em->block_start + em->block_len == next->block_start))
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ ret = false;
+ else if ((em->block_start + em->block_len == next->block_start) &&
+ (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
ret = false;
free_extent_map(next);
@@ -1088,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
}
next_mergeable = defrag_check_next_extent(inode, em);
-
/*
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
@@ -1735,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
BTRFS_SUBVOL_QGROUP_INHERIT)) {
ret = -EOPNOTSUPP;
- goto out;
+ goto free_args;
}
if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
if (vol_args->size > PAGE_CACHE_SIZE) {
ret = -EINVAL;
- goto out;
+ goto free_args;
}
inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
if (IS_ERR(inherit)) {
ret = PTR_ERR(inherit);
- goto out;
+ goto free_args;
}
}
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol, ptr,
readonly, inherit);
+ if (ret)
+ goto free_inherit;
- if (ret == 0 && ptr &&
- copy_to_user(arg +
- offsetof(struct btrfs_ioctl_vol_args_v2,
- transid), ptr, sizeof(*ptr)))
+ if (ptr && copy_to_user(arg +
+ offsetof(struct btrfs_ioctl_vol_args_v2,
+ transid),
+ ptr, sizeof(*ptr)))
ret = -EFAULT;
-out:
- kfree(vol_args);
+
+free_inherit:
kfree(inherit);
+free_args:
+ kfree(vol_args);
return ret;
}
@@ -2685,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto err_drop;
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2703,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
out:
kfree(vol_args);
+err_drop:
mnt_drop_write_file(file);
return ret;
}
@@ -3527,7 +3500,8 @@ process_slot:
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- last_dest_end = new_key.offset + datal;
+ last_dest_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
destoff, olen);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 963895c1f801..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
+ btrfs_flush_delalloc_helper,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b497498484be..ded5c601d916 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1973,7 +1973,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
elem.seq, &roots);
btrfs_put_tree_mod_seq(fs_info, &elem);
if (ret < 0)
- return ret;
+ goto out;
if (roots->nnodes != 1)
goto out;
@@ -2720,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
if (ret) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..0a6b6e4bcbb9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,7 +1416,8 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ rmw_work, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers,
&rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
- btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ read_rebuild_work, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers,
&rbio->work);
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_init_work(&plug->work, btrfs_rmw_helper,
+ unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..20408c6b665a 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
+ btrfs_init_work(&rmw->work, btrfs_readahead_helper,
+ reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..f4a41f37be22 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
- NULL, NULL);
+ btrfs_init_work(&sbio->work, btrfs_scrub_helper,
+ scrub_bio_end_io_worker, NULL, NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -999,8 +999,8 @@ nodatasum_case:
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
- btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
- NULL, NULL);
+ btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
+ scrub_fixup_nodatasum, NULL, NULL);
btrfs_queue_work(fs_info->scrub_workers,
&fixup_nodatasum->work);
goto out;
@@ -1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
+ scrub_wr_bio_end_io_worker, NULL, NULL);
btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
@@ -2904,6 +2905,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ struct rcu_string *name;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -2965,6 +2967,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -ENODEV;
}
+ if (!is_dev_replace && !readonly && !dev->writeable) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ btrfs_err(fs_info, "scrub: device %s is not writable",
+ name->str);
+ rcu_read_unlock();
+ return -EROFS;
+ }
+
mutex_lock(&fs_info->scrub_lock);
if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->len = len;
nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
- btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
+ btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
+ copy_nocow_pages_worker, NULL, NULL);
INIT_LIST_HEAD(&nocow_ctx->inodes);
btrfs_queue_work(fs_info->scrub_nocow_workers,
&nocow_ctx->work);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..12e53556e214 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -614,7 +614,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
if (!fs_info->device_dir_kobj)
return -EINVAL;
- if (one_device) {
+ if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9e1f2cd5e67a..d0262ceb85e1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
#define LOG_WALK_REPLAY_ALL 3
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct list_head ordered_sums;
int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
bool has_extents = false;
- bool need_find_last_extent = (*last_extent == 0);
+ bool need_find_last_extent = true;
bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
*/
if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
has_extents = true;
- if (need_find_last_extent &&
- first_key.objectid == (u64)-1)
+ if (first_key.objectid == (u64)-1)
first_key = ins_keys[i];
} else {
need_find_last_extent = false;
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!has_extents)
return ret;
+ if (need_find_last_extent && *last_extent == first_key.offset) {
+ /*
+ * We don't have any leaves between our current one and the one
+ * we processed before that can have file extent items for our
+ * inode (and have a generation number smaller than our current
+ * transaction id).
+ */
+ need_find_last_extent = false;
+ }
+
/*
* Because we use btrfs_search_forward we could skip leaves that were
* not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
0, 0);
if (ret)
break;
- *last_extent = offset + len;
+ *last_extent = extent_end;
}
/*
* Need to let the callers know we dropped the path so they should
@@ -3849,8 +3860,10 @@ process:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3867,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
path = btrfs_alloc_path();
if (!path)
@@ -3980,7 +3994,8 @@ again:
if (ret < 0) {
err = ret;
goto out_unlock;
- } if (ret) {
+ }
+ if (ret) {
ins_nr = 0;
btrfs_release_path(path);
continue;
@@ -4040,13 +4055,35 @@ log_extents:
goto out_unlock;
}
} else if (inode_only == LOG_INODE_ALL) {
- struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em, *n;
- write_lock(&tree->lock);
- list_for_each_entry_safe(em, n, &tree->modified_extents, list)
- list_del_init(&em->list);
- write_unlock(&tree->lock);
+ write_lock(&em_tree->lock);
+ /*
+ * We can't just remove every em if we're called for a ranged
+ * fsync - that is, one that doesn't cover the whole possible
+ * file range (0 to LLONG_MAX). This is because we can have
+ * em's that fall outside the range we're logging and therefore
+ * their ordered operations haven't completed yet
+ * (btrfs_finish_ordered_io() not invoked yet). This means we
+ * didn't get their respective file extent item in the fs/subvol
+ * tree yet, and need to let the next fast fsync (one which
+ * consults the list of modified extent maps) find the em so
+ * that it logs a matching file extent item and waits for the
+ * respective ordered operation to complete (if it's still
+ * running).
+ *
+ * Removing every em outside the range we're logging would make
+ * the next fast fsync not log their matching file extent items,
+ * therefore making us lose data after a log replay.
+ */
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+ list) {
+ const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+ if (em->mod_start >= start && mod_end <= end)
+ list_del_init(&em->list);
+ }
+ write_unlock(&em_tree->lock);
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4056,6 +4093,7 @@ log_extents:
goto out_unlock;
}
}
+
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
@@ -4152,7 +4190,10 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only,
+ struct dentry *parent,
+ const loff_t start,
+ const loff_t end,
+ int exists_only,
struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4198,7 +4239,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
if (ret)
goto end_trans;
@@ -4226,7 +4267,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ ret = btrfs_log_inode(trans, root, inode, inode_only,
+ 0, LLONG_MAX);
if (ret)
goto end_trans;
}
@@ -4260,13 +4302,15 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
- 0, ctx);
+ start, end, 0, ctx);
dput(parent);
return ret;
@@ -4503,6 +4547,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+ LLONG_MAX, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..e2e798ae7cd7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..2c2d6d1d8eee 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -508,6 +508,43 @@ static noinline int device_list_add(const char *path,
ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
+ /*
+ * When FS is already mounted.
+ * 1. If you are here and if the device->name is NULL that
+ * means this device was missing at time of FS mount.
+ * 2. If you are here and if the device->name is different
+ * from 'path' that means either
+ * a. The same device disappeared and reappeared with
+ * different name. or
+ * b. The missing-disk-which-was-replaced, has
+ * reappeared now.
+ *
+ * We must allow 1 and 2a above. But 2b would be spurious
+ * and unintentional.
+ *
+ * Further in case of 1 and 2a above, the disk at 'path'
+ * would have missed some transaction when it was away and
+ * in case of 2a the stale bdev has to be updated as well.
+ * 2b must not be allowed at any time.
+ */
+
+ /*
+ * For now, we do allow update to btrfs_fs_device through the
+ * btrfs dev scan cli after FS has been mounted. We're still
+ * tracking a problem where systems fail mount by subvolume id
+ * when we reject replacement on a mounted FS.
+ */
+ if (!fs_devices->opened && found_transid < device->generation) {
+ /*
+ * That is if the FS is _not_ mounted and if you
+ * are here, that means there is more than one
+ * disk with same uuid and devid. We keep the one
+ * with larger generation number or the last-in if
+ * generations are equal.
+ */
+ return -EEXIST;
+ }
+
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
@@ -519,6 +556,15 @@ static noinline int device_list_add(const char *path,
}
}
+ /*
+ * Unmount does not free the btrfs_device struct but would zero
+ * generation along with most of the other members. So just update
+ * it back. We need it to pick the disk with largest generation
+ * (as above).
+ */
+ if (!fs_devices->opened)
+ device->generation = found_transid;
+
if (found_transid > fs_devices->latest_trans) {
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
@@ -1436,7 +1482,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+ btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
@@ -1671,7 +1717,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
device->fs_devices->total_devices--;
if (device->missing)
- root->fs_info->fs_devices->missing_devices--;
+ device->fs_devices->missing_devices--;
next_device = list_entry(root->fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
@@ -1801,8 +1847,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->bdev) {
fs_info->fs_devices->open_devices--;
- /* zero out the old super */
- btrfs_scratch_superblock(srcdev);
+ /*
+ * zero out the old super only if the device is writable
+ * (a seed device, for example, is not)
+ */
+ if (srcdev->writeable)
+ btrfs_scratch_superblock(srcdev);
}
call_rcu(&srcdev->rcu, free_device);
@@ -1941,6 +1991,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
+ fs_devices->missing_devices = 0;
+ fs_devices->num_can_discard = 0;
+ fs_devices->rotating = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -5800,7 +5853,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
+ btrfs_init_work(&dev->work, btrfs_submit_helper,
+ pending_bios_fn, NULL, NULL);
return dev;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111bbb8b..3588a80854b2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
bh = page_buffers(page);
if (bh->b_size == size) {
end_block = init_page_buffers(page, bdev,
- index << sizebits, size);
+ (sector_t)index << sizebits,
+ size);
goto done;
}
if (!try_to_free_buffers(page))
@@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
*/
spin_lock(&inode->i_mapping->private_lock);
link_dev_buffers(page, bh);
- end_block = init_page_buffers(page, bdev, index << sizebits, size);
+ end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
+ size);
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d749731dc0ee..fbb08e97438d 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent < 100);
if (*args) {
- pr_err("'bind' command doesn't take an argument");
+ pr_err("'bind' command doesn't take an argument\n");
return -EINVAL;
}
if (!cache->rootdirname) {
- pr_err("No cache directory specified");
+ pr_err("No cache directory specified\n");
return -EINVAL;
}
/* don't permit already bound caches to be re-bound */
if (test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("Cache already bound");
+ pr_err("Cache already bound\n");
return -EBUSY;
}
@@ -248,7 +248,7 @@ error_open_root:
kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
cachefiles_end_secure(cache, saved_cred);
- pr_err("Failed to register: %d", ret);
+ pr_err("Failed to register: %d\n", ret);
return ret;
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index b078d3081d6c..ce1b115dcc28 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
char *args)
{
- pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
+ pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
return -EINVAL;
}
@@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- pr_err("Empty directory specified");
+ pr_err("Empty directory specified\n");
return -EINVAL;
}
if (cache->rootdirname) {
- pr_err("Second cache directory specified");
+ pr_err("Second cache directory specified\n");
return -EEXIST;
}
@@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- pr_err("Empty security context specified");
+ pr_err("Empty security context specified\n");
return -EINVAL;
}
if (cache->secctx) {
- pr_err("Second security context specified");
+ pr_err("Second security context specified\n");
return -EINVAL;
}
@@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- pr_err("Empty tag specified");
+ pr_err("Empty tag specified\n");
return -EINVAL;
}
@@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("cull applied to unready cache");
+ pr_err("cull applied to unready cache\n");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- pr_err("cull applied to dead cache");
+ pr_err("cull applied to dead cache\n");
return -EIO;
}
@@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- pr_err("cull command requires dirfd to be a directory");
+ pr_err("cull command requires dirfd to be a directory\n");
return -ENOTDIR;
inval:
- pr_err("cull command requires dirfd and filename");
+ pr_err("cull command requires dirfd and filename\n");
return -EINVAL;
}
@@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
return 0;
inval:
- pr_err("debug command requires mask");
+ pr_err("debug command requires mask\n");
return -EINVAL;
}
@@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("inuse applied to unready cache");
+ pr_err("inuse applied to unready cache\n");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- pr_err("inuse applied to dead cache");
+ pr_err("inuse applied to dead cache\n");
return -EIO;
}
@@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- pr_err("inuse command requires dirfd to be a directory");
+ pr_err("inuse command requires dirfd to be a directory\n");
return -ENOTDIR;
inval:
- pr_err("inuse command requires dirfd and filename");
+ pr_err("inuse command requires dirfd and filename\n");
return -EINVAL;
}
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 3d50998abf57..8c52472d2efa 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
#define cachefiles_io_error(___cache, FMT, ...) \
do { \
- pr_err("I/O Error: " FMT, ##__VA_ARGS__); \
+ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
fscache_io_error(&(___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
} while (0)
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 180edfb45f66..711f13d8c2de 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -84,7 +84,7 @@ error_proc:
error_object_jar:
misc_deregister(&cachefiles_dev);
error_dev:
- pr_err("failed to register: %d", ret);
+ pr_err("failed to register: %d\n", ret);
return ret;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..dad7d9542a24 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -543,7 +543,7 @@ lookup_again:
next, next->d_inode, next->d_inode->i_ino);
} else if (!S_ISDIR(next->d_inode->i_mode)) {
- pr_err("inode %lu is not a directory",
+ pr_err("inode %lu is not a directory\n",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -574,7 +574,7 @@ lookup_again:
} else if (!S_ISDIR(next->d_inode->i_mode) &&
!S_ISREG(next->d_inode->i_mode)
) {
- pr_err("inode %lu is not a file or directory",
+ pr_err("inode %lu is not a file or directory\n",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
ASSERT(subdir->d_inode);
if (!S_ISDIR(subdir->d_inode->i_mode)) {
- pr_err("%s is not a directory", dirname);
+ pr_err("%s is not a directory\n", dirname);
ret = -EIO;
goto check_error;
}
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
!subdir->d_inode->i_op->lookup ||
!subdir->d_inode->i_op->mkdir ||
!subdir->d_inode->i_op->create ||
- !subdir->d_inode->i_op->rename ||
+ (!subdir->d_inode->i_op->rename &&
+ !subdir->d_inode->i_op->rename2) ||
!subdir->d_inode->i_op->rmdir ||
!subdir->d_inode->i_op->unlink)
goto check_error;
@@ -795,13 +796,13 @@ check_error:
mkdir_error:
mutex_unlock(&dir->d_inode->i_mutex);
dput(subdir);
- pr_err("mkdir %s failed with error %d", dirname, ret);
+ pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
mutex_unlock(&dir->d_inode->i_mutex);
ret = PTR_ERR(subdir);
- pr_err("Lookup %s failed with error %d", dirname, ret);
+ pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
@@ -891,7 +892,7 @@ lookup_error:
if (ret == -EIO) {
cachefiles_io_error(cache, "Lookup failed");
} else if (ret != -ENOMEM) {
- pr_err("Internal error: %d", ret);
+ pr_err("Internal error: %d\n", ret);
ret = -EIO;
}
@@ -950,7 +951,7 @@ error:
}
if (ret != -ENOMEM) {
- pr_err("Internal error: %d", ret);
+ pr_err("Internal error: %d\n", ret);
ret = -EIO;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..25e745b8eb1b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
struct cachefiles_one_read *monitor;
struct cachefiles_object *object;
struct fscache_retrieval *op;
- struct pagevec pagevec;
int error, max;
op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
_enter("{ino=%lu}", object->backer->d_inode->i_ino);
- pagevec_init(&pagevec, 0);
-
max = 8;
spin_lock_irq(&object->work_lock);
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- struct pagevec pagevec;
struct inode *inode;
sector_t block0, block;
unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
op->op.flags |= FSCACHE_OP_ASYNC;
op->op.processor = cachefiles_read_copier;
- pagevec_init(&pagevec, 0);
-
/* we assume the absence or presence of the first block is a good
* enough indication for the page as a whole
* - TODO: don't use bmap() for this as it is _not_ actually good
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 1ad51ffbb275..acbc1f094fb1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
}
if (ret != -EEXIST) {
- pr_err("Can't set xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
if (ret == -ERANGE)
goto bad_type_length;
- pr_err("Can't read xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -85,14 +85,14 @@ error:
return ret;
bad_type_length:
- pr_err("Cache object %lu type xattr length incorrect",
+ pr_err("Cache object %lu type xattr length incorrect\n",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
bad_type:
xtype[2] = 0;
- pr_err("Cache object %*.*s [%lu] type %s not %s",
+ pr_err("Cache object %*.*s [%lu] type %s not %s\n",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
xtype, type);
@@ -293,7 +293,7 @@ error:
return ret;
bad_type_length:
- pr_err("Cache object %lu xattr length incorrect",
+ pr_err("Cache object %lu xattr length incorrect\n",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 603f18a65c12..a2172f3f69e3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -22,6 +22,11 @@ config CIFS
support for OS/2 and Windows ME and similar servers is provided as
well.
+ The module also provides optional support for the follow-on
+ protocols for CIFS including SMB3, which enables
+ useful performance and security features (see the description
+ of CONFIG_CIFS_SMB2).
+
The cifs module provides an advanced network file system
client for mounting to CIFS compliant servers. It includes
support for DFS (hierarchical name space), secure per-user
@@ -121,7 +126,8 @@ config CIFS_ACL
depends on CIFS_XATTR && KEYS
help
Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller.
+ is handed over to the application/caller. See the man
+ page for getcifsacl for more information.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
@@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB2
- bool "SMB2 network file system support"
+ bool "SMB2 and SMB3 network file system support"
depends on CIFS && INET
select NLS
select KEYS
@@ -170,16 +176,21 @@ config CIFS_SMB2
select DNS_RESOLVER
help
- This enables experimental support for the SMB2 (Server Message Block
- version 2) protocol. The SMB2 protocol is the successor to the
- popular CIFS and SMB network file sharing protocols. SMB2 is the
- native file sharing mechanism for recent versions of Windows
- operating systems (since Vista). SMB2 enablement will eventually
- allow users better performance, security and features, than would be
- possible with cifs. Note that smb2 mount options also are simpler
- (compared to cifs) due to protocol improvements.
-
- Unless you are a developer or tester, say N.
+ This enables support for the Server Message Block version 2
+ family of protocols, including SMB3. SMB3 support is
+ enabled on mount by specifying "vers=3.0" in the mount
+ options. These protocols are the successors to the popular
+ CIFS and SMB network file sharing protocols. SMB3 is the
+ native file sharing mechanism for the more recent
+ versions of Windows (Windows 8 and Windows 2012 and
+ later) and Samba server and many others support SMB3 well.
+ In general SMB3 enables better performance, security
+ and features, than would be possible with CIFS (Note that
+ when mounting to Samba, due to the CIFS POSIX extensions,
+ CIFS mounts can provide slightly better POSIX compatibility
+ than SMB3 mounts do though). Note that SMB2/SMB3 mount
+ options are also slightly simpler (compared to CIFS) due
+ to protocol improvements.
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b0fafa499505..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.04"
+#define CIFS_VERSION "2.05"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index dfc731b02aa9..25b8392bfdd2 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,11 +70,6 @@
#define SERVER_NAME_LENGTH 40
#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
-/* used to define string lengths for reversing unicode strings */
-/* (256+1)*2 = 514 */
-/* (max path length + 1 for null) * 2 for unicode */
-#define MAX_NAME 514
-
/* SMB echo "timeout" -- FIXME: tunable? */
#define SMB_ECHO_INTERVAL (60 * HZ)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 03ed8a09581c..36ca2045009b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
tmp_end++;
if (!(tmp_end < end && tmp_end[1] == delim)) {
/* No it is not. Set the password to NULL */
+ kfree(vol->password);
vol->password = NULL;
break;
}
@@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
options = end;
}
+ kfree(vol->password);
/* Now build new password string */
temp_len = strlen(value);
vol->password = kzalloc(temp_len+1, GFP_KERNEL);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3db0c5fd9a11..6cbd9c688cfe 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
goto out;
}
+ if (file->f_flags & O_DIRECT &&
+ CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+ if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+ file->f_op = &cifs_file_direct_nobrl_ops;
+ else
+ file->f_op = &cifs_file_direct_ops;
+ }
+
file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
if (file_info == NULL) {
if (server->ops->close)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d5fec92e0360..5f29354b072a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file)
cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
inode, file->f_flags, full_path);
+ if (file->f_flags & O_DIRECT &&
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+ file->f_op = &cifs_file_direct_nobrl_ops;
+ else
+ file->f_op = &cifs_file_direct_ops;
+ }
+
if (server->oplocks)
oplock = REQ_OPLOCK;
else
@@ -3560,15 +3568,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
lru_cache_add_file(page);
unlock_page(page);
page_cache_release(page);
- if (rc == -EAGAIN)
- list_add_tail(&page->lru, &tmplist);
}
+ /* Fallback to the readpage in error/reconnect cases */
kref_put(&rdata->refcount, cifs_readdata_release);
- if (rc == -EAGAIN) {
- /* Re-add pages to the page_list and retry */
- list_splice(&tmplist, page_list);
- continue;
- }
break;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 949ec909ec9a..7899a40465b3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1720,7 +1720,10 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
unlink_target:
/* Try unlinking the target dentry if it's not negative */
if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
- tmprc = cifs_unlink(target_dir, target_dentry);
+ if (d_is_dir(target_dentry))
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ else
+ tmprc = cifs_unlink(target_dir, target_dentry);
if (tmprc)
goto cifs_rename_exit;
rc = cifs_do_rename(xid, source_dentry, from_name,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..5657416d3483 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto out;
- rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
- fromName, buf, &bytes_written);
+ if (tcon->ses->server->ops->create_mf_symlink)
+ rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
+ cifs_sb, fromName, buf, &bytes_written);
+ else
+ rc = -EOPNOTSUPP;
+
if (rc)
goto out;
@@ -339,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
+ if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ rc = -ENOENT;
/* it's not a symlink */
goto out;
+ }
io_parms.netfid = fid.netfid;
io_parms.pid = current->tgid;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
/* BB what about the timezone? BB */
/* Subtract the NTFS time offset, then convert to 1s intervals. */
- u64 t;
+ s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+
+ /*
+ * Unfortunately we cannot use normal 64 bit division on 32 bit arch, but
+ * the alternative, do_div, does not work with negative numbers, so we
+ * have to special case them
+ */
+ if (t < 0) {
+ t = -t;
+ ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
+ ts.tv_nsec = -ts.tv_nsec;
+ ts.tv_sec = -t;
+ } else {
+ ts.tv_nsec = (long)do_div(t, 10000000) * 100;
+ ts.tv_sec = t;
+ }
- t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
- ts.tv_nsec = do_div(t, 10000000) * 100;
- ts.tv_sec = t;
return ts;
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 798c80a41c88..b334a89d6a66 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -596,8 +596,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
spin_unlock(&cifs_file_list_lock);
- if (server->ops->close)
- server->ops->close(xid, tcon, &cfile->fid);
+ if (server->ops->close_dir)
+ server->ops->close_dir(xid, tcon, &cfile->fid);
} else
spin_unlock(&cifs_file_list_lock);
if (cfile->srch_inf.ntwrk_buf_start) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 39ee32688eac..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
kfree(ses->serverOS);
ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
- if (ses->serverOS)
+ if (ses->serverOS) {
strncpy(ses->serverOS, bcc_ptr, len);
- if (strncmp(ses->serverOS, "OS/2", 4) == 0)
- cifs_dbg(FYI, "OS/2 server\n");
+ if (strncmp(ses->serverOS, "OS/2", 4) == 0)
+ cifs_dbg(FYI, "OS/2 server\n");
+ }
bcc_ptr += len + 1;
bleft -= len + 1;
@@ -744,14 +745,6 @@ out:
sess_free_buffer(sess_data);
}
-#else
-
-static void
-sess_auth_lanman(struct sess_data *sess_data)
-{
- sess_data->result = -EOPNOTSUPP;
- sess_data->func = NULL;
-}
#endif
static void
@@ -1102,15 +1095,6 @@ out:
ses->auth_key.response = NULL;
}
-#else
-
-static void
-sess_auth_kerberos(struct sess_data *sess_data)
-{
- cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
- sess_data->result = -ENOSYS;
- sess_data->func = NULL;
-}
#endif /* ! CONFIG_CIFS_UPCALL */
/*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 1a6df4b03f67..52131d8cb4d5 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
if (tmprc == -EOPNOTSUPP)
*symlink = true;
- else
+ else if (tmprc == 0)
CIFSSMBClose(xid, tcon, fid.netfid);
}
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 3f17b4550831..45992944e238 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
goto out;
}
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
if (smb2_data == NULL) {
rc = -ENOMEM;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 0150182a4494..899bbc86f73e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
*adjust_tz = false;
*symlink = false;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
if (smb2_data == NULL)
return -ENOMEM;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index af59d03db492..8257a5a97cc0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
"STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
{STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
+ {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
+ "STATUS_REPARSE_NOT_HANDLED"},
{STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
"STATUS_DEVICE_REQUIRES_CLEANING"},
{STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 5a48aa290dfe..f522193b7184 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_file_all_info *smb2_data;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
if (smb2_data == NULL)
return -ENOMEM;
@@ -1035,7 +1035,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
if (keep_size == false)
return -EOPNOTSUPP;
- /*
+ /*
* Must check if file sparse since fallocate -z (zero range) assumes
* non-sparse allocation
*/
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index fa0dd044213b..74b3a6684383 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
struct smb2_sess_setup_rsp *rsp = NULL;
struct kvec iov[2];
int rc = 0;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
struct TCP_Server_Info *server = ses->server;
u16 blob_length = 0;
@@ -1403,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
rsp = (struct smb2_close_rsp *)iov[0].iov_base;
if (rc != 0) {
- if (tcon)
- cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+ cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
goto close_exit;
}
@@ -1533,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
{
return query_info(xid, tcon, persistent_fid, volatile_fid,
FILE_ALL_INFORMATION,
- sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
sizeof(struct smb2_file_all_info), data);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index d30ce699ae4b..cb25a1a5e307 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
- hash = hash + (hash >> d_hash_shift);
- return dentry_hashtable + (hash & d_hash_mask);
+ return dentry_hashtable + hash_32(hash, d_hash_shift);
}
/* Statistics gathering. */
@@ -2373,7 +2372,8 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
}
EXPORT_SYMBOL(dentry_update_name_case);
-static void switch_names(struct dentry *dentry, struct dentry *target)
+static void switch_names(struct dentry *dentry, struct dentry *target,
+ bool exchange)
{
if (dname_external(target)) {
if (dname_external(dentry)) {
@@ -2407,13 +2407,19 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
*/
unsigned int i;
BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+ if (!exchange) {
+ memcpy(dentry->d_iname, target->d_name.name,
+ target->d_name.len + 1);
+ dentry->d_name.hash_len = target->d_name.hash_len;
+ return;
+ }
for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
swap(((long *) &dentry->d_iname)[i],
((long *) &target->d_iname)[i]);
}
}
}
- swap(dentry->d_name.len, target->d_name.len);
+ swap(dentry->d_name.hash_len, target->d_name.hash_len);
}
static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
@@ -2443,25 +2449,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
}
}
-static void dentry_unlock_parents_for_move(struct dentry *dentry,
- struct dentry *target)
+static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
{
if (target->d_parent != dentry->d_parent)
spin_unlock(&dentry->d_parent->d_lock);
if (target->d_parent != target)
spin_unlock(&target->d_parent->d_lock);
+ spin_unlock(&target->d_lock);
+ spin_unlock(&dentry->d_lock);
}
/*
* When switching names, the actual string doesn't strictly have to
* be preserved in the target - because we're dropping the target
* anyway. As such, we can just do a simple memcpy() to copy over
- * the new name before we switch.
- *
- * Note that we have to be a lot more careful about getting the hash
- * switched - we have to switch the hash value properly even if it
- * then no longer matches the actual (corrupted) string of the target.
- * The hash value has to match the hash queue that the dentry is on..
+ * the new name before we switch, unless we are going to rehash
+ * it. Note that if we *do* unhash the target, we are not allowed
+ * to rehash it without giving it a new name/hash key - whether
+ * we swap or overwrite the names here, resulting name won't match
+ * the reality in filesystem; it's only there for d_path() purposes.
+ * Note that all of this is happening under rename_lock, so
+ * any hash lookup seeing it in the middle of manipulations will
+ * be discarded anyway. So we do not care what happens to the hash
+ * key in that case.
*/
/*
* __d_move - move a dentry
@@ -2507,36 +2517,30 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
d_hash(dentry->d_parent, dentry->d_name.hash));
}
- list_del(&dentry->d_u.d_child);
- list_del(&target->d_u.d_child);
-
/* Switch the names.. */
- switch_names(dentry, target);
- swap(dentry->d_name.hash, target->d_name.hash);
+ switch_names(dentry, target, exchange);
- /* ... and switch the parents */
+ /* ... and switch them in the tree */
if (IS_ROOT(dentry)) {
+ /* splicing a tree */
dentry->d_parent = target->d_parent;
target->d_parent = target;
- INIT_LIST_HEAD(&target->d_u.d_child);
+ list_del_init(&target->d_u.d_child);
+ list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
} else {
+ /* swapping two dentries */
swap(dentry->d_parent, target->d_parent);
-
- /* And add them back to the (new) parent lists */
- list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
+ list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
+ list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+ if (exchange)
+ fsnotify_d_move(target);
+ fsnotify_d_move(dentry);
}
- list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
-
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
- dentry_unlock_parents_for_move(dentry, target);
- if (exchange)
- fsnotify_d_move(target);
- spin_unlock(&target->d_lock);
- fsnotify_d_move(dentry);
- spin_unlock(&dentry->d_lock);
+ dentry_unlock_for_move(dentry, target);
}
/*
@@ -2634,39 +2638,6 @@ out_err:
return ret;
}
-/*
- * Prepare an anonymous dentry for life in the superblock's dentry tree as a
- * named dentry in place of the dentry to be replaced.
- * returns with anon->d_lock held!
- */
-static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
-{
- struct dentry *dparent;
-
- dentry_lock_for_move(anon, dentry);
-
- write_seqcount_begin(&dentry->d_seq);
- write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
-
- dparent = dentry->d_parent;
-
- switch_names(dentry, anon);
- swap(dentry->d_name.hash, anon->d_name.hash);
-
- dentry->d_parent = dentry;
- list_del_init(&dentry->d_u.d_child);
- anon->d_parent = dparent;
- list_move(&anon->d_u.d_child, &dparent->d_subdirs);
-
- write_seqcount_end(&dentry->d_seq);
- write_seqcount_end(&anon->d_seq);
-
- dentry_unlock_parents_for_move(anon, dentry);
- spin_unlock(&dentry->d_lock);
-
- /* anon->d_lock still locked, returns locked */
-}
-
/**
* d_splice_alias - splice a disconnected dentry into the tree if one exists
* @inode: the inode which may have a disconnected dentry
@@ -2712,11 +2683,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
return ERR_PTR(-EIO);
}
write_seqlock(&rename_lock);
- __d_materialise_dentry(dentry, new);
+ __d_move(new, dentry, false);
write_sequnlock(&rename_lock);
- __d_drop(new);
- _d_rehash(new);
- spin_unlock(&new->d_lock);
spin_unlock(&inode->i_lock);
security_d_instantiate(new, inode);
iput(inode);
@@ -2776,9 +2744,8 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
} else if (IS_ROOT(alias)) {
/* Is this an anonymous mountpoint that we
* could splice into our tree? */
- __d_materialise_dentry(dentry, alias);
+ __d_move(alias, dentry, false);
write_sequnlock(&rename_lock);
- __d_drop(alias);
goto found;
} else {
/* Nope, but we must(!) avoid directory
@@ -2804,13 +2771,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
actual = __d_instantiate_unique(dentry, inode);
if (!actual)
actual = dentry;
- else
- BUG_ON(!d_unhashed(actual));
- spin_lock(&actual->d_lock);
+ d_rehash(actual);
found:
- _d_rehash(actual);
- spin_unlock(&actual->d_lock);
spin_unlock(&inode->i_lock);
out_nolock:
if (actual == dentry) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c3116404ab49..e181b6b2e297 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
ssize_t ret;
- ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
+ ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
&sdio->from);
if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b10b48c2a7af..7bcfff900f05 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
- ep_take_care_of_epollwakeup(&epds);
+ if (ep_op_has_event(op))
+ ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5b19760b1de5..b0c225cdb52c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
/*
* Special error return code only used by dx_probe() and its callers.
*/
-#define ERR_BAD_DX_DIR -75000
+#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
/*
* Timeout and state flag for lazy initialization inode thread.
@@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
+/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
+{
+ int changed = 0;
+
+ if (newsize > inode->i_size) {
+ i_size_write(inode, newsize);
+ changed = 1;
+ }
+ if (newsize > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, newsize);
+ changed |= 2;
+ }
+ return changed;
+}
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 76c2df382b7d..74292a71b384 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4665,7 +4665,8 @@ retry:
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
- ext4_lblk_t len, int flags, int mode)
+ ext4_lblk_t len, loff_t new_size,
+ int flags, int mode)
{
struct inode *inode = file_inode(file);
handle_t *handle;
@@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int retries = 0;
struct ext4_map_blocks map;
unsigned int credits;
+ loff_t epos;
map.m_lblk = offset;
+ map.m_len = len;
/*
* Don't normalize the request if it can fit in one extent so
* that it doesn't get unnecessarily split into multiple
@@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
credits = ext4_chunk_trans_blocks(inode, len);
retry:
- while (ret >= 0 && ret < len) {
- map.m_lblk = map.m_lblk + ret;
- map.m_len = len = len - ret;
+ while (ret >= 0 && len) {
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
credits);
if (IS_ERR(handle)) {
@@ -4709,6 +4710,21 @@ retry:
ret2 = ext4_journal_stop(handle);
break;
}
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
+ epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ inode->i_ctime = ext4_current_time(inode);
+ if (new_size) {
+ if (epos > new_size)
+ epos = new_size;
+ if (ext4_update_inode_size(inode, epos) & 0x1)
+ inode->i_mtime = inode->i_ctime;
+ } else {
+ if (epos > inode->i_size)
+ ext4_set_inode_flag(inode,
+ EXT4_INODE_EOFBLOCKS);
+ }
+ ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret2)
break;
@@ -4731,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t new_size = 0;
int ret = 0;
int flags;
- int partial;
+ int credits;
+ int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
struct address_space *mapping = inode->i_mapping;
@@ -4771,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (start < offset || end > offset + len)
return -EINVAL;
- partial = (offset + len) & ((1 << blkbits) - 1);
+ partial_begin = offset & ((1 << blkbits) - 1);
+ partial_end = (offset + len) & ((1 << blkbits) - 1);
lblk = start >> blkbits;
max_blocks = (end >> blkbits);
@@ -4805,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* If we have a partial block after EOF we have to allocate
* the entire block.
*/
- if (partial)
+ if (partial_end)
max_blocks += 1;
}
@@ -4813,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages*/
truncate_pagecache_range(inode, start, end - 1);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
@@ -4825,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (ret)
goto out_dio;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
- mode);
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+ flags, mode);
if (ret)
goto out_dio;
}
+ if (!partial_begin && !partial_end)
+ goto out_dio;
- handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ /*
+ * In worst case we have to writeout two nonadjacent unwritten
+ * blocks and update the inode
+ */
+ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
+ if (ext4_should_journal_data(inode))
+ credits += 2;
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
@@ -4839,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-
if (new_size) {
- if (new_size > i_size_read(inode))
- i_size_write(inode, new_size);
- if (new_size > EXT4_I(inode)->i_disksize)
- ext4_update_i_disksize(inode, new_size);
+ ext4_update_inode_size(inode, new_size);
} else {
/*
* Mark that we allocate beyond EOF so the subsequent truncate
@@ -4853,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
-
ext4_mark_inode_dirty(handle, inode);
/* Zero out partial block at the edges of the range */
@@ -4880,13 +4903,11 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- handle_t *handle;
loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
int flags;
ext4_lblk_t lblk;
- struct timespec tv;
unsigned int blkbits = inode->i_blkbits;
/* Return error if mode is not supported */
@@ -4937,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+ flags, mode);
if (ret)
goto out;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- goto out;
-
- tv = inode->i_ctime = ext4_current_time(inode);
-
- if (new_size) {
- if (new_size > i_size_read(inode)) {
- i_size_write(inode, new_size);
- inode->i_mtime = tv;
- }
- if (new_size > EXT4_I(inode)->i_disksize)
- ext4_update_i_disksize(inode, new_size);
- } else {
- /*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if ((offset + len) > i_size_read(inode))
- ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
}
- ext4_mark_inode_dirty(handle, inode);
- if (file->f_flags & O_SYNC)
- ext4_handle_sync(handle);
-
- ext4_journal_stop(handle);
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 367a60c07cf0..3aa26e9117c4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file,
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
-
/*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hole i_mutex.
- *
- * But it's important to update i_size while still holding page lock:
+ * it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
*/
- if (pos + copied > inode->i_size) {
- i_size_write(inode, pos + copied);
- i_size_changed = 1;
- }
-
- if (pos + copied > EXT4_I(inode)->i_disksize) {
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * but greater than i_disksize. (hint delalloc)
- */
- ext4_update_i_disksize(inode, (pos + copied));
- i_size_changed = 1;
- }
+ i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
page_cache_release(page);
@@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file,
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
- loff_t new_i_size;
+ int size_changed = 0;
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file,
if (!partial)
SetPageUptodate(page);
}
- new_i_size = pos + copied;
- if (new_i_size > inode->i_size)
- i_size_write(inode, pos+copied);
+ size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, new_i_size);
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
}
- unlock_page(page);
- page_cache_release(page);
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -2095,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
struct ext4_map_blocks *map = &mpd->map;
int err;
loff_t disksize;
+ int progress = 0;
mpd->io_submit.io_end->offset =
((loff_t)map->m_lblk) << inode->i_blkbits;
@@ -2111,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle,
* is non-zero, a commit should free up blocks.
*/
if ((err == -ENOMEM) ||
- (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ (err == -ENOSPC && ext4_count_free_clusters(sb))) {
+ if (progress)
+ goto update_disksize;
return err;
+ }
ext4_msg(sb, KERN_CRIT,
"Delayed block allocation failed for "
"inode %lu at logical offset %llu with"
@@ -2129,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle,
*give_up_on_write = true;
return err;
}
+ progress = 1;
/*
* Update buffer state, submit mapped pages, and get us new
* extent to map
*/
err = mpage_map_and_submit_buffers(mpd);
if (err < 0)
- return err;
+ goto update_disksize;
} while (map->m_len);
+update_disksize:
/*
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 956027711faf..8b0f9ef517d6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
+ if (WARN_ON(count == 0))
+ return;
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
/* Don't bother if the block group is corrupt. */
@@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
int err;
if (pa == NULL) {
+ if (ac->ac_f_ex.fe_len == 0)
+ return;
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
if (err) {
/*
@@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
ac->ac_f_ex.fe_len);
ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+ ext4_mb_unload_buddy(&e4b);
return;
}
if (pa->pa_type == MB_INODE_PA)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b147a67baa0d..603e4ebbd0ac 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
buffer */
int num = 0;
ext4_lblk_t nblocks;
- int i, err;
+ int i, err = 0;
int namelen;
*res_dir = NULL;
@@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (bh || (err != ERR_BAD_DX_DIR))
+ if (err == -ENOENT)
+ return NULL;
+ if (err && err != ERR_BAD_DX_DIR)
+ return ERR_PTR(err);
+ if (bh)
return bh;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
@@ -1295,6 +1299,11 @@ restart:
}
num++;
bh = ext4_getblk(NULL, dir, b++, 0, &err);
+ if (unlikely(err)) {
+ if (ra_max == 0)
+ return ERR_PTR(err);
+ break;
+ }
bh_use[ra_max] = bh;
if (bh)
ll_rw_block(READ | REQ_META | REQ_PRIO,
@@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-ENAMETOOLONG);
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return (struct dentry *) bh;
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct buffer_head *bh;
bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
+ if (IS_ERR(bh))
+ return (struct dentry *) bh;
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (!bh)
goto end_rmdir;
@@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (!bh)
goto end_unlink;
@@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
struct ext4_dir_entry_2 *de;
bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (bh) {
retval = ext4_delete_entry(handle, dir, de, bh);
brelse(bh);
@@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
return retval;
}
-static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
+ int force_reread)
{
int retval;
/*
@@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
ent->de->name_len != ent->dentry->d_name.len ||
strncmp(ent->de->name, ent->dentry->d_name.name,
- ent->de->name_len)) {
+ ent->de->name_len) ||
+ force_reread) {
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
} else {
@@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
.dentry = new_dentry,
.inode = new_dentry->d_inode,
};
+ int force_reread;
int retval;
dquot_initialize(old.dir);
@@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
dquot_initialize(new.inode);
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ if (IS_ERR(old.bh))
+ return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -3214,6 +3238,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
+ if (IS_ERR(new.bh)) {
+ retval = PTR_ERR(new.bh);
+ new.bh = NULL;
+ goto end_rename;
+ }
if (new.bh) {
if (!new.inode) {
brelse(new.bh);
@@ -3246,6 +3275,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
}
+ /*
+ * If we're renaming a file within an inline_data dir and adding or
+ * setting the new dirent causes a conversion from inline_data to
+ * extents/blockmap, we need to force the dirent delete code to
+ * re-read the directory, or else we end up trying to delete a dirent
+ * from what is now the extent tree root (or a block map).
+ */
+ force_reread = (new.dir->i_ino == old.dir->i_ino &&
+ ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
@@ -3256,6 +3294,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
}
+ if (force_reread)
+ force_reread = !ext4_test_inode_flag(new.dir,
+ EXT4_INODE_INLINE_DATA);
/*
* Like most other Unix systems, set the ctime for inodes on a
@@ -3267,7 +3308,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
/*
* ok, that's it
*/
- ext4_rename_delete(handle, &old);
+ ext4_rename_delete(handle, &old, force_reread);
if (new.inode) {
ext4_dec_count(handle, new.inode);
@@ -3330,6 +3371,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
&old.de, &old.inlined);
+ if (IS_ERR(old.bh))
+ return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -3342,6 +3385,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
+ if (IS_ERR(new.bh)) {
+ retval = PTR_ERR(new.bh);
+ new.bh = NULL;
+ goto end_rename;
+ }
/* RENAME_EXCHANGE case: old *and* new must both exist */
if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f03e2e..1e43b905ff98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@ handle_bb:
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@ handle_ib:
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32b43ad154b9..0b28b36e7915 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb)
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
- /* journal checksum v2 */
+ /* journal checksum v3 */
compat = 0;
- incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
} else {
/* journal checksum v1 */
compat = JBD2_FEATURE_COMPAT_CHECKSUM;
@@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
jbd2_journal_clear_features(sbi->s_journal,
JBD2_FEATURE_COMPAT_CHECKSUM, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ JBD2_FEATURE_INCOMPAT_CSUM_V3 |
JBD2_FEATURE_INCOMPAT_CSUM_V2);
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe1054fce..736a348509f7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@ config F2FS_STAT_FS
mounted as f2fs. Each file shows the whole f2fs information.
/sys/kernel/debug/f2fs/status includes:
- - major file system information managed by f2fs currently
+ - major filesystem information managed by f2fs currently
- average SIT information about whole segments
- current memory footprint consumed by f2fs.
@@ -68,6 +68,6 @@ config F2FS_CHECK_FS
bool "F2FS consistency checking feature"
depends on F2FS_FS
help
- Enables BUG_ONs which check the file system consistency in runtime.
+ Enables BUG_ONs which check the filesystem consistency in runtime.
If you want to improve the performance, say N.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6aeed5bada52..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
return page;
}
-static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ bool readahead = false;
+ struct page *page;
+
+ page = find_get_page(META_MAPPING(sbi), index);
+ if (!page || (page && !PageUptodate(page)))
+ readahead = true;
+ f2fs_put_page(page, 0);
+
+ if (readahead)
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+ return get_meta_page(sbi, index);
+}
+
+static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
{
switch (type) {
case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
case META_SSA:
case META_CP:
return 0;
+ case META_POR:
+ return MAX_BLKADDR(sbi);
default:
BUG();
}
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
{
block_t prev_blk_addr = 0;
struct page *page;
- int blkno = start;
- int max_blks = get_max_meta_blks(sbi, type);
+ block_t blkno = start;
+ block_t max_blks = get_max_meta_blks(sbi, type);
struct f2fs_io_info fio = {
.type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
break;
case META_SSA:
case META_CP:
- /* get ssa/cp block addr */
+ case META_POR:
+ if (unlikely(blkno >= max_blks))
+ goto out;
+ if (unlikely(blkno < SEG0_BLKADDR(sbi)))
+ goto out;
blk_addr = blkno;
break;
default:
@@ -151,8 +172,7 @@ out:
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
trace_f2fs_writepage(page, META);
@@ -160,14 +180,11 @@ static int f2fs_write_meta_page(struct page *page,
goto redirty_out;
if (wbc->for_reclaim)
goto redirty_out;
-
- /* Should not write any meta pages, if any IO error was occurred */
- if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
- goto no_write;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
-no_write:
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
return 0;
@@ -180,7 +197,7 @@ redirty_out:
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
trace_f2fs_writepages(mapping->host, wbc, META);
@@ -262,15 +279,12 @@ continue_unlock:
static int f2fs_set_meta_page_dirty(struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
trace_f2fs_set_page_dirty(page, META);
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- inc_page_count(sbi, F2FS_DIRTY_META);
+ inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
return 1;
}
return 0;
@@ -348,7 +362,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-static void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_dirty_inode(struct f2fs_sb_info *sbi)
{
struct ino_entry *e, *tmp;
int i;
@@ -381,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- f2fs_bug_on(sbi->n_orphans == 0);
+ f2fs_bug_on(sbi, sbi->n_orphans == 0);
sbi->n_orphans--;
spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
}
@@ -401,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode = f2fs_iget(sbi->sb, ino);
- f2fs_bug_on(IS_ERR(inode));
+ f2fs_bug_on(sbi, IS_ERR(inode));
clear_nlink(inode);
/* truncate all the data during iput */
@@ -446,8 +460,8 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
struct f2fs_orphan_block *orphan_blk = NULL;
unsigned int nentries = 0;
unsigned short index;
- unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
- (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+ unsigned short orphan_blocks =
+ (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
struct page *page = NULL;
struct ino_entry *orphan = NULL;
@@ -462,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
list_for_each_entry(orphan, head, list) {
if (!page) {
page = find_get_page(META_MAPPING(sbi), start_blk++);
- f2fs_bug_on(!page);
+ f2fs_bug_on(sbi, !page);
orphan_blk =
(struct f2fs_orphan_block *)page_address(page);
memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -622,7 +636,7 @@ fail_no_cp:
static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
return -EEXIST;
@@ -634,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
return 0;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+void update_dirty_page(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *new;
int ret = 0;
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
return;
+ if (!S_ISDIR(inode->i_mode)) {
+ inode_inc_dirty_pages(inode);
+ goto out;
+ }
+
new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new->inode = inode;
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
ret = __add_dirty_inode(inode, new);
- inode_inc_dirty_dents(inode);
- SetPagePrivate(page);
+ inode_inc_dirty_pages(inode);
spin_unlock(&sbi->dir_inode_lock);
if (ret)
kmem_cache_free(inode_entry_slab, new);
+out:
+ SetPagePrivate(page);
}
void add_dirty_dir_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *new =
f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
int ret = 0;
@@ -677,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
void remove_dirty_dir_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *entry;
if (!S_ISDIR(inode->i_mode))
return;
spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_dents(inode) ||
+ if (get_dirty_pages(inode) ||
!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
spin_unlock(&sbi->dir_inode_lock);
return;
@@ -737,7 +757,7 @@ retry:
/*
* Freeze all the FS-operations for checkpoint.
*/
-static void block_operations(struct f2fs_sb_info *sbi)
+static int block_operations(struct f2fs_sb_info *sbi)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
@@ -745,6 +765,7 @@ static void block_operations(struct f2fs_sb_info *sbi)
.for_reclaim = 0,
};
struct blk_plug plug;
+ int err = 0;
blk_start_plug(&plug);
@@ -754,11 +775,15 @@ retry_flush_dents:
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
sync_dirty_dir_inodes(sbi);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto out;
+ }
goto retry_flush_dents;
}
/*
- * POR: we should ensure that there is no dirty node pages
+ * POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush.
*/
retry_flush_nodes:
@@ -767,9 +792,16 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
sync_node_pages(sbi, 0, &wbc);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_unlock_all(sbi);
+ err = -EIO;
+ goto out;
+ }
goto retry_flush_nodes;
}
+out:
blk_finish_plug(&plug);
+ return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
@@ -793,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
- nid_t last_nid = 0;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
@@ -813,8 +846,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
/* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META))
+ while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+ }
next_free_nid(sbi, &last_nid);
@@ -825,7 +861,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
ckpt->cur_node_blkoff[i] =
@@ -833,7 +869,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
ckpt->alloc_type[i + CURSEG_HOT_NODE] =
curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
}
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
ckpt->cur_data_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
ckpt->cur_data_blkoff[i] =
@@ -848,24 +884,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = npages_for_summary_flush(sbi);
- if (data_sum_blocks < 3)
+ if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
- orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
- / F2FS_ORPHANS_PER_BLOCK;
+ orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
- if (is_umount) {
+ if (cpc->reason == CP_UMOUNT) {
set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
} else {
clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks);
}
@@ -875,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
else
clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ if (sbi->need_fsck)
+ set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
/* update SIT/NAT bitmap */
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -909,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
- if (is_umount) {
+ if (cpc->reason == CP_UMOUNT) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
}
@@ -924,6 +962,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* wait for previous submitted node/meta pages writeback */
wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
@@ -934,27 +975,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
- clear_prefree_segments(sbi);
- release_dirty_inode(sbi);
- F2FS_RESET_SB_DIRT(sbi);
- }
+ release_dirty_inode(sbi);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
}
/*
- * We guarantee that this checkpoint procedure should not fail.
+ * We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
mutex_lock(&sbi->cp_mutex);
- block_operations(sbi);
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+ if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
+ goto out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto out;
+ if (block_operations(sbi))
+ goto out;
+
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
f2fs_submit_merged_bio(sbi, DATA, WRITE);
f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -970,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* write cached NAT/SIT entries to NAT/SIT area */
flush_nat_entries(sbi);
- flush_sit_entries(sbi);
+ flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, is_umount);
+ do_checkpoint(sbi, cpc);
unblock_operations(sbi);
- mutex_unlock(&sbi->cp_mutex);
-
stat_inc_cp_count(sbi->stat_info);
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
+out:
+ mutex_unlock(&sbi->cp_mutex);
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
@@ -999,8 +1048,8 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
* for cp pack we can have max 1020*504 orphan entries
*/
sbi->n_orphans = 0;
- sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
- * F2FS_ORPHANS_PER_BLOCK;
+ sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+ NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
}
int __init create_checkpoint_caches(void)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 03313099c51c..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
struct page *page = bvec->bv_page;
if (unlikely(err)) {
- SetPageError(page);
+ set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
f2fs_stop_checkpoint(sbi);
}
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio = bio_alloc(GFP_NOIO, npages);
bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
bio->bi_private = sbi;
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int bio_blocks = MAX_BIO_BLOCKS(sbi);
io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
int reserve_new_block(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int err;
/* if inode_page exists, index should be zero */
- f2fs_bug_on(!need_put && index);
+ f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
err = get_dnode_of_data(dn, index, ALLOC_NODE);
if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
block_t start_blkaddr, end_blkaddr;
int need_update = true;
- f2fs_bug_on(blk_addr == NEW_ADDR);
+ f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
@@ -396,7 +396,6 @@ end_update:
struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return page;
}
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
sync ? READ_SYNC : READA);
if (err)
return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
*/
struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
@@ -490,7 +488,8 @@ repeat:
return page;
}
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+ dn.data_blkaddr, READ_SYNC);
if (err)
return ERR_PTR(err);
@@ -517,7 +516,6 @@ repeat:
struct page *get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct page *page;
struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
- READ_SYNC);
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+ dn.data_blkaddr, READ_SYNC);
if (err)
goto put_err;
@@ -573,10 +571,12 @@ put_err:
static int __allocate_data_block(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
block_t new_blkaddr;
struct node_info ni;
+ pgoff_t fofs;
int type;
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
update_extent_cache(new_blkaddr, dn);
clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ /* update i_size */
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
+ if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
+ i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+
dn->data_blkaddr = new_blkaddr;
return 0;
}
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create, bool fiemap)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
unsigned maxblocks = bh_result->b_size >> blkbits;
struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
goto out;
if (create) {
- f2fs_balance_fs(sbi);
- f2fs_lock_op(sbi);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+ f2fs_lock_op(F2FS_I_SB(inode));
}
/* When reading holes, we need its node page */
@@ -691,7 +696,7 @@ get_next:
allocated = true;
blkaddr = dn.data_blkaddr;
}
- /* Give more consecutive addresses for the read ahead */
+ /* Give more consecutive addresses for the readahead */
if (blkaddr == (bh_result->b_blocknr + ofs)) {
ofs++;
dn.ofs_in_node++;
@@ -707,7 +712,7 @@ put_out:
f2fs_put_dnode(&dn);
unlock_out:
if (create)
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(F2FS_I_SB(inode));
out:
trace_f2fs_get_data_block(inode, iblock, bh_result, err);
return err;
@@ -739,7 +744,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
trace_f2fs_readpage(page, DATA);
- /* If the file has inline data, try to read it directlly */
+ /* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
else
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
@@ -836,10 +841,19 @@ write:
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
err = do_write_data_page(page, &fio);
goto done;
}
+ /* we should bypass data pages to let the kworker jobs proceed */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ SetPageError(page);
+ unlock_page(page);
+ goto out;
+ }
+
if (!wbc->for_reclaim)
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0))
@@ -857,7 +871,7 @@ done:
clear_cold_data(page);
out:
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
unlock_page(page);
if (need_balance_fs)
f2fs_balance_fs(sbi);
@@ -883,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
bool locked = false;
int ret;
long diff;
@@ -895,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return 0;
if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
- get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
+ get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
@@ -917,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return ret;
skip_write:
- wbc->pages_skipped += get_dirty_dents(inode);
+ wbc->pages_skipped += get_dirty_pages(inode);
return 0;
}
@@ -927,7 +941,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
if (to > inode->i_size) {
truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size);
+ truncate_blocks(inode, inode->i_size, true);
}
}
@@ -936,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
@@ -946,7 +960,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
f2fs_balance_fs(sbi);
repeat:
- err = f2fs_convert_inline_data(inode, pos + len);
+ err = f2fs_convert_inline_data(inode, pos + len, NULL);
if (err)
goto fail;
@@ -1038,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
- set_page_dirty(page);
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ register_inmem_page(inode, page);
+ else
+ set_page_dirty(page);
if (pos + copied > i_size_read(inode)) {
i_size_write(inode, pos + copied);
@@ -1083,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
if (check_direct_IO(inode, rw, iter, offset))
return 0;
- /* clear fsync mark to recover these blocks */
- fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
-
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1101,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
+
+ if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+ return;
+
if (PageDirty(page))
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
ClearPagePrivate(page);
}
@@ -1124,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- set_dirty_dir_page(inode, page);
+ update_dirty_page(inode, page);
return 1;
}
return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a441ba33be11..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
struct f2fs_stat_info *si = F2FS_STAT(sbi);
int i;
- /* valid check of the segment numbers */
+ /* validation check of the segment numbers */
si->hit_ext = sbi->read_hit_ext;
si->total_ext = sbi->total_hit_ext;
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
hblks_per_sec = blks_per_sec / 2;
- for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
+ dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
si->bimodal = bimodal / dist;
if (si->dirty_count)
si->avg_vblocks = total_vblocks / ndirty;
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build sit */
si->base_mem += sizeof(struct sit_info);
- si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
- si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+ si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
+ si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
if (sbi->segs_per_sec > 1)
- si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
+ si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
/* build free segmap */
si->base_mem += sizeof(struct free_segmap_info);
- si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,10 +149,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
- si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+ si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
- /* buld nm */
+ /* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index bcf893c3d903..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -124,9 +124,9 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
/*
* For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs are occurred.
+ * We stop here for figuring out where the bug has occurred.
*/
- f2fs_bug_on(!de->name_len);
+ f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
bool room = false;
int max_slots = 0;
- f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
+ f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
int update_dent_inode(struct inode *inode, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
- page = get_node_page(sbi, inode->i_ino);
+ page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct page *page;
int err;
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
if (err)
goto put_error;
} else {
- page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
return page;
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
* we should remove this inode from orphan list.
*/
if (inode->i_nlink == 0)
- remove_orphan_inode(sbi, inode->i_ino);
+ remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
inc_nlink(inode);
}
return page;
@@ -391,7 +389,7 @@ put_error:
error:
/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
truncate_inode_pages(&inode->i_data, 0);
- truncate_blocks(inode, 0);
+ truncate_blocks(inode, 0, false);
remove_dirty_dir_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
@@ -563,7 +561,7 @@ fail:
}
/*
- * It only removes the dentry from the dentry page,corresponding name
+ * It only removes the dentry from the dentry page, corresponding name
* entry in name page does not need to be touched during deletion.
*/
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
{
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
- struct address_space *mapping = page->mapping;
- struct inode *dir = mapping->host;
+ struct inode *dir = page->mapping->host;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (inode) {
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
down_write(&F2FS_I(inode)->i_sem);
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
truncate_hole(dir, page->index, page->index + 1);
clear_page_dirty_for_io(page);
ClearPageUptodate(page);
- inode_dec_dirty_dents(dir);
+ inode_dec_dirty_pages(dir);
}
f2fs_put_page(page, 1);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4dab5338a97a..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
#include <linux/sched.h>
#ifdef CONFIG_F2FS_CHECK_FS
-#define f2fs_bug_on(condition) BUG_ON(condition)
+#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
#else
-#define f2fs_bug_on(condition)
+#define f2fs_bug_on(sbi, condition) \
+ do { \
+ if (unlikely(condition)) { \
+ WARN_ON(1); \
+ sbi->need_fsck = true; \
+ } \
+ } while (0)
#define f2fs_down_write(x, y) down_write(x)
#endif
@@ -90,6 +96,20 @@ enum {
SIT_BITMAP
};
+enum {
+ CP_UMOUNT,
+ CP_SYNC,
+ CP_DISCARD,
+};
+
+struct cp_control {
+ int reason;
+ __u64 trim_start;
+ __u64 trim_end;
+ __u64 trim_minlen;
+ __u64 trimmed;
+};
+
/*
* For CP/NAT/SIT/SSA readahead
*/
@@ -97,7 +117,8 @@ enum {
META_CP,
META_NAT,
META_SIT,
- META_SSA
+ META_SSA,
+ META_POR,
};
/* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
struct fsync_inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
- block_t blkaddr; /* block address locating the last inode */
+ block_t blkaddr; /* block address locating the last fsync */
+ block_t last_dentry; /* block address locating the last dentry */
+ block_t last_inode; /* block address locating the last inode */
};
#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
{
int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
return before;
}
+static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
+ int type)
+{
+ if (type == NAT_JOURNAL)
+ return size <= MAX_NAT_JENTRIES(sum);
+ return size <= MAX_SIT_JENTRIES(sum);
+}
+
/*
* ioctl commands
*/
-#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+
+#define F2FS_IOCTL_MAGIC 0xf5
+#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
+#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
+#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- atomic_t dirty_dents; /* # of dirty dentry pages */
+ atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
+
+ struct list_head inmem_pages; /* in-memory pages managed by f2fs */
+ struct mutex inmem_lock; /* lock for in-memory pages */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
+ struct radix_tree_root nat_set_root;/* root of the nat set cache */
rwlock_t nat_tree_lock; /* protect nat_tree_lock */
- unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entries; /* cached nat entry list (clean) */
- struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
- struct list_head nat_entry_set; /* nat entry set list */
+ unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
/* free node ids management */
@@ -332,18 +373,16 @@ enum {
};
struct flush_cmd {
- struct flush_cmd *next;
struct completion wait;
+ struct llist_node llnode;
int ret;
};
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- struct flush_cmd *issue_list; /* list for command issue */
- struct flush_cmd *dispatch_list; /* list for command dispatch */
- spinlock_t issue_lock; /* for issue list lock */
- struct flush_cmd *issue_tail; /* list tail of issue list */
+ struct llist_head issue_list; /* list for command issue */
+ struct llist_node *dispatch_list; /* list for command dispatch */
};
struct f2fs_sm_info {
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
int nr_discards; /* # of discards in the list */
int max_discards; /* max. discards to be issued */
+ struct list_head sit_entry_set; /* sit entry set list */
+
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
+ unsigned int min_fsync_blocks; /* threshold for fsync */
/* for flush command control */
struct flush_cmd_control *cmd_control_info;
@@ -395,7 +437,7 @@ enum count_type {
};
/*
- * The below are the page types of bios used in submti_bio().
+ * The below are the page types of bios used in submit_bio().
* The available types are:
* DATA User data pages. It operates as async mode.
* NODE Node pages. It operates as async mode.
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
int s_dirty; /* dirty flag for checkpoint */
+ bool need_fsck; /* need fsck.f2fs to fix */
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
@@ -470,7 +513,7 @@ struct f2fs_sb_info {
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- /* basic file system units */
+ /* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
unsigned int log_blocksize; /* log2 block size */
unsigned int blocksize; /* block size */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
+{
+ return F2FS_SB(inode->i_sb);
+}
+
+static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
+{
+ return F2FS_I_SB(mapping->host);
+}
+
+static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
+{
+ return F2FS_M_SB(page->mapping);
+}
+
static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
{
return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
blkcnt_t count)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
- f2fs_bug_on(inode->i_blocks < count);
+ f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
+ f2fs_bug_on(sbi, inode->i_blocks < count);
inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
F2FS_SET_SB_DIRT(sbi);
}
-static inline void inode_inc_dirty_dents(struct inode *inode)
+static inline void inode_inc_dirty_pages(struct inode *inode)
{
- inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
- atomic_inc(&F2FS_I(inode)->dirty_dents);
+ atomic_inc(&F2FS_I(inode)->dirty_pages);
+ if (S_ISDIR(inode->i_mode))
+ inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
atomic_dec(&sbi->nr_pages[count_type]);
}
-static inline void inode_dec_dirty_dents(struct inode *inode)
+static inline void inode_dec_dirty_pages(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
return;
- dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
- atomic_dec(&F2FS_I(inode)->dirty_dents);
+ atomic_dec(&F2FS_I(inode)->dirty_pages);
+
+ if (S_ISDIR(inode->i_mode))
+ dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
}
static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
return atomic_read(&sbi->nr_pages[count_type]);
}
-static inline int get_dirty_dents(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
{
- return atomic_read(&F2FS_I(inode)->dirty_dents);
+ return atomic_read(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -799,7 +860,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
/*
* odd numbered checkpoint should at cp segment 0
- * and even segent must be at cp segment 1
+ * and even segment must be at cp segment 1
*/
if (!(ckpt_version & 1))
start_addr += sbi->blocks_per_seg;
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(!sbi->total_valid_block_count);
- f2fs_bug_on(!sbi->total_valid_node_count);
- f2fs_bug_on(!inode->i_blocks);
+ f2fs_bug_on(sbi, !sbi->total_valid_block_count);
+ f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+ f2fs_bug_on(sbi, !inode->i_blocks);
inode->i_blocks--;
sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
+ f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
sbi->total_valid_inode_count++;
spin_unlock(&sbi->stat_lock);
}
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(!sbi->total_valid_inode_count);
+ f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
sbi->total_valid_inode_count--;
spin_unlock(&sbi->stat_lock);
}
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
return;
if (unlock) {
- f2fs_bug_on(!PageLocked(page));
+ f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
unlock_page(page);
}
page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
FI_INLINE_DATA, /* used for inline data*/
FI_APPEND_WRITE, /* inode has appended data */
FI_UPDATE_WRITE, /* inode has in-place-update data */
- FI_NEED_IPU, /* used fo ipu for fdatasync */
+ FI_NEED_IPU, /* used for ipu per file */
+ FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_VOLATILE_FILE, /* indicate volatile file */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
}
+static inline bool f2fs_is_atomic_file(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_volatile_file(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+}
+
static inline void *inline_data_addr(struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1096,6 +1169,11 @@ static inline int f2fs_readonly(struct super_block *sb)
return sb->s_flags & MS_RDONLY;
}
+static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
+{
+ return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+}
+
static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
{
set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
@@ -1117,7 +1195,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
-int truncate_blocks(struct inode *, u64);
+int truncate_blocks(struct inode *, u64, bool);
void f2fs_truncate(struct inode *);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1136,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
void update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
+void handle_failed_inode(struct inode *);
/*
* namei.c
@@ -1183,9 +1262,9 @@ struct dnode_of_data;
struct node_info;
bool available_free_memory(struct f2fs_sb_info *, int);
-int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
-void fsync_mark_clear(struct f2fs_sb_info *, nid_t);
+bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
+bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1202,10 +1281,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
-void recover_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, struct node_info *, block_t);
void recover_inline_xattr(struct inode *, struct page *);
-bool recover_xattr_data(struct inode *, struct page *, block_t);
+void recover_xattr_data(struct inode *, struct page *, block_t);
int recover_inode_page(struct f2fs_sb_info *, struct page *);
int restore_node_summary(struct f2fs_sb_info *, unsigned int,
struct f2fs_summary_block *);
@@ -1218,6 +1295,8 @@ void destroy_node_manager_caches(void);
/*
* segment.c
*/
+void register_inmem_page(struct inode *, struct page *);
+void commit_inmem_pages(struct inode *, bool);
void f2fs_balance_fs(struct f2fs_sb_info *);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1226,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *);
+void release_discard_addrs(struct f2fs_sb_info *);
void discard_next_dnode(struct f2fs_sb_info *, block_t);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
+int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1238,8 +1319,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *,
void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
void recover_data_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, block_t, block_t);
-void rewrite_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, block_t, block_t);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1247,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *);
+void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
int build_segment_manager(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
int __init create_segment_manager_caches(void);
@@ -1258,10 +1337,12 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
+struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
+void release_dirty_inode(struct f2fs_sb_info *);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1269,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
void recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
-void set_dirty_dir_page(struct inode *, struct page *);
+void update_dirty_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, bool);
+void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
void init_ino_entry_info(struct f2fs_sb_info *);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
@@ -1357,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_SB(inode->i_sb))->inline_inode++); \
+ ((F2FS_I_SB(inode))->inline_inode++); \
} while (0)
#define stat_dec_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_SB(inode->i_sb))->inline_inode--); \
+ ((F2FS_I_SB(inode))->inline_inode--); \
} while (0)
#define stat_inc_seg_type(sbi, curseg) \
@@ -1439,8 +1520,8 @@ extern const struct inode_operations f2fs_special_inode_operations;
*/
bool f2fs_may_inline(struct inode *);
int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_data(struct inode *, pgoff_t);
+int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
void truncate_inline_data(struct inode *, u64);
-int recover_inline_data(struct inode *, struct page *);
+bool recover_inline_data(struct inode *, struct page *);
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 208f1a9bd569..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
int err;
@@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
sb_start_pagefault(inode->i_sb);
+ /* force conversion of inline data so that normal data indices are used */
+ err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
+ if (err)
+ goto out;
+
/* block allocation */
f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,11 +115,31 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
return 1;
}
+static inline bool need_do_checkpoint(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ bool need_cp = false;
+
+ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+ need_cp = true;
+ else if (file_wrong_pino(inode))
+ need_cp = true;
+ else if (!space_for_roll_forward(sbi))
+ need_cp = true;
+ else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
+ need_cp = true;
+ else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+ need_cp = true;
+
+ return need_cp;
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ nid_t ino = inode->i_ino;
int ret = 0;
bool need_cp = false;
struct writeback_control wbc = {
@@ -129,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (datasync)
+ if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(fi, FI_NEED_IPU);
-
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (datasync)
- clear_inode_flag(fi, FI_NEED_IPU);
+ clear_inode_flag(fi, FI_NEED_IPU);
+
if (ret) {
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
@@ -144,33 +168,31 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* if there is no written data, don't waste time to write recovery info.
*/
if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
- !exist_written_data(sbi, inode->i_ino, APPEND_INO)) {
+ !exist_written_data(sbi, ino, APPEND_INO)) {
+ struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+
+ /* but do not skip the sync if there are pending inode updates */
+ if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
+ f2fs_put_page(i, 0);
+ goto go_write;
+ }
+ f2fs_put_page(i, 0);
+
if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
- exist_written_data(sbi, inode->i_ino, UPDATE_INO))
+ exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
}
-
+go_write:
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
- down_read(&fi->i_sem);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
- need_cp = true;
- else if (file_wrong_pino(inode))
- need_cp = true;
- else if (!space_for_roll_forward(sbi))
- need_cp = true;
- else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
- need_cp = true;
- else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
- need_cp = true;
-
+ down_read(&fi->i_sem);
+ need_cp = need_do_checkpoint(inode);
up_read(&fi->i_sem);
if (need_cp) {
@@ -194,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
up_write(&fi->i_sem);
}
} else {
- /* if there is no written node page, write its inode page */
- while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
- if (fsync_mark_done(sbi, inode->i_ino))
- goto out;
+sync_nodes:
+ sync_node_pages(sbi, ino, &wbc);
+
+ if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
+ goto sync_nodes;
}
- ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
+
+ ret = wait_on_node_pages_writeback(sbi, ino);
if (ret)
goto out;
/* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+ remove_dirty_inode(sbi, ino, APPEND_INO);
clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ remove_dirty_inode(sbi, ino, UPDATE_INO);
clear_inode_flag(fi, FI_UPDATE_WRITE);
- ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
+ ret = f2fs_issue_flush(F2FS_I_SB(inode));
}
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -288,7 +312,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
- /* direct node is not exist */
+ /* direct node does not exist */
if (whence == SEEK_DATA) {
pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
F2FS_I(inode));
@@ -340,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
maxbytes, i_size_read(inode));
case SEEK_DATA:
case SEEK_HOLE:
+ if (offset < 0)
+ return -ENXIO;
return f2fs_seek_block(file, offset, whence);
}
@@ -356,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_node *raw_node;
__le32 *addr;
@@ -417,9 +443,9 @@ out:
f2fs_put_page(page, 1);
}
-int truncate_blocks(struct inode *inode, u64 from)
+int truncate_blocks(struct inode *inode, u64 from, bool lock)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
@@ -433,14 +459,16 @@ int truncate_blocks(struct inode *inode, u64 from)
free_from = (pgoff_t)
((from + blocksize - 1) >> (sbi->log_blocksize));
- f2fs_lock_op(sbi);
+ if (lock)
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- f2fs_unlock_op(sbi);
+ if (lock)
+ f2fs_unlock_op(sbi);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
@@ -448,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from)
count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
count -= dn.ofs_in_node;
- f2fs_bug_on(count < 0);
+ f2fs_bug_on(sbi, count < 0);
if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
truncate_data_blocks_range(&dn, count);
@@ -458,7 +486,8 @@ int truncate_blocks(struct inode *inode, u64 from)
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
- f2fs_unlock_op(sbi);
+ if (lock)
+ f2fs_unlock_op(sbi);
done:
/* lastly zero out the first data page */
truncate_partial_data_page(inode, from);
@@ -475,7 +504,7 @@ void f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
- if (!truncate_blocks(inode, i_size_read(inode))) {
+ if (!truncate_blocks(inode, i_size_read(inode), true)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
@@ -531,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- err = f2fs_convert_inline_data(inode, attr->ia_size);
+ if (attr->ia_valid & ATTR_SIZE) {
+ err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
if (err)
return err;
- truncate_setsize(inode, attr->ia_size);
- f2fs_truncate(inode);
- f2fs_balance_fs(F2FS_SB(inode->i_sb));
+ if (attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ f2fs_truncate(inode);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+ } else {
+ /*
+ * giving a chance to truncate blocks past EOF which
+ * are fallocated with FALLOC_FL_KEEP_SIZE.
+ */
+ f2fs_truncate(inode);
+ }
}
__setattr_copy(inode, attr);
@@ -573,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
static void fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
if (!len)
@@ -622,7 +658,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
loff_t off_start, off_end;
int ret = 0;
- ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ /* skip punching hole beyond i_size */
+ if (offset >= inode->i_size)
+ return ret;
+
+ ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
if (ret)
return ret;
@@ -645,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
f2fs_balance_fs(sbi);
@@ -666,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t index, pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_start, off_end;
@@ -678,7 +721,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (ret)
return ret;
- ret = f2fs_convert_inline_data(inode, offset + len);
+ ret = f2fs_convert_inline_data(inode, offset + len, NULL);
if (ret)
return ret;
@@ -762,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
return flags & F2FS_OTHER_FLMASK;
}
-long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *)arg);
+}
+
+static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags;
+ unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int oldflags;
int ret;
- switch (cmd) {
- case F2FS_IOC_GETFLAGS:
- flags = fi->i_flags & FS_FL_USER_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case F2FS_IOC_SETFLAGS:
- {
- unsigned int oldflags;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
+ if (get_user(flags, (int __user *)arg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
+ mutex_lock(&inode->i_mutex);
+
+ oldflags = fi->i_flags;
- if (get_user(flags, (int __user *) arg)) {
- ret = -EFAULT;
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EPERM;
goto out;
}
+ }
- flags = f2fs_mask_flags(inode->i_mode, flags);
+ flags = flags & FS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ fi->i_flags = flags;
+ mutex_unlock(&inode->i_mutex);
- mutex_lock(&inode->i_mutex);
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
- oldflags = fi->i_flags;
+static int f2fs_ioc_start_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- ret = -EPERM;
- goto out;
- }
- }
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ f2fs_balance_fs(sbi);
- flags = flags & FS_FL_USER_MODIFIABLE;
- flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
- fi->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- f2fs_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
-out:
- mnt_drop_write_file(filp);
+ return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+}
+
+static int f2fs_ioc_commit_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (f2fs_is_volatile_file(inode))
+ return 0;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
return ret;
- }
+
+ if (f2fs_is_atomic_file(inode))
+ commit_inmem_pages(inode, false);
+
+ ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_start_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ return 0;
+}
+
+static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max((unsigned int)range.minlen,
+ q->limits.discard_granularity);
+ ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+ return 0;
+}
+
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case F2FS_IOC_GETFLAGS:
+ return f2fs_ioc_getflags(filp, arg);
+ case F2FS_IOC_SETFLAGS:
+ return f2fs_ioc_setflags(filp, arg);
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ return f2fs_ioc_start_atomic_write(filp);
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ return f2fs_ioc_commit_atomic_write(filp);
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ return f2fs_ioc_start_volatile_write(filp);
+ case FITRIM:
+ return f2fs_ioc_fitrim(filp, arg);
default:
return -ENOTTY;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d7947d90ccc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@ static int gc_thread_func(void *data)
* 3. IO subsystem is idle by checking the # of requests in
* bdev's request list.
*
- * Note) We have to avoid triggering GCs too much frequently.
+ * Note) We have to avoid triggering GCs frequently.
* Because it is possible that some segments can be
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
- for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) {
+ for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
clear_bit(secno, dirty_i->victim_secmap);
@@ -222,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
- /* Handle if the system time is changed by user */
+ /* Handle the case where the user has changed the system time */
if (mtime < sit_i->min_mtime)
sit_i->min_mtime = mtime;
if (mtime > sit_i->max_mtime)
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int secno, max_cost;
int nsearched = 0;
+ mutex_lock(&dirty_i->seglist_lock);
+
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
- mutex_lock(&dirty_i->seglist_lock);
-
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap,
- TOTAL_SEGS(sbi), p.offset);
- if (segno >= TOTAL_SEGS(sbi)) {
+ segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
+ if (segno >= MAIN_SEGS(sbi)) {
if (sbi->last_victim[p.gc_mode]) {
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
if (IS_ERR(node_page))
continue;
+ /* block may become invalid during get_node_page */
+ if (check_valid_map(sbi, segno, off) == 0) {
+ f2fs_put_page(node_page, 1);
+ continue;
+ }
+
/* set page dirty and write it */
if (gc_type == FG_GC) {
f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
f2fs_wait_on_page_writeback(page, DATA);
if (clear_page_dirty_for_io(page))
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
set_cold_data(page);
do_write_data_page(page, &fio);
clear_cold_data(page);
@@ -593,7 +598,7 @@ next_step:
if (phase == 2) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode))
continue;
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -688,17 +693,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
int gc_type = BG_GC;
int nfree = 0;
int ret = -1;
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
INIT_LIST_HEAD(&ilist);
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ if (unlikely(f2fs_cp_error(sbi)))
goto stop;
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
gc_type = FG_GC;
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
}
if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
goto gc_more;
if (gc_type == FG_GC)
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
stop:
mutex_unlock(&sbi->gc_mutex);
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb6047bf4..16f0b2b22999 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
block_t invalid_user_blocks = sbi->user_block_count -
written_block_count(sbi);
/*
- * Background GC is triggered with the following condition.
+ * Background GC is triggered with the following conditions.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 948d17bf7281..a844fcfb9a8d 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[])
buf[1] += b1;
}
-static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
+static void str2hashbuf(const unsigned char *msg, size_t len,
+ unsigned int *buf, int num)
{
unsigned pad, val;
int i;
@@ -73,9 +74,9 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
{
__u32 hash;
f2fs_hash_t f2fs_hash;
- const char *p;
+ const unsigned char *p;
__u32 in[8], buf[4];
- const char *name = name_info->name;
+ const unsigned char *name = name_info->name;
size_t len = name_info->len;
if ((len <= 2) && (name[0] == '.') &&
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5beeccef9ae1..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
bool f2fs_may_inline(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t nr_blocks;
loff_t i_size;
- if (!test_opt(sbi, INLINE_DATA))
+ if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
+ return false;
+
+ if (f2fs_is_atomic_file(inode))
return false;
nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
int f2fs_read_inline_data(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *ipage;
void *src_addr, *dst_addr;
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
goto out;
}
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
unlock_page(page);
return PTR_ERR(ipage);
@@ -68,12 +69,12 @@ out:
static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
{
- int err;
+ int err = 0;
struct page *ipage;
struct dnode_of_data dn;
void *src_addr, *dst_addr;
block_t new_blk_addr;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_io_info fio = {
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
@@ -86,6 +87,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
goto out;
}
+ /* someone else converted inline_data already */
+ if (!f2fs_has_inline_data(inode))
+ goto out;
+
/*
* i_addr[0] is not used for inline data,
* so reserving new block will not destroy inline data
@@ -124,9 +129,10 @@ out:
return err;
}
-int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
+int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
+ struct page *page)
{
- struct page *page;
+ struct page *new_page = page;
int err;
if (!f2fs_has_inline_data(inode))
@@ -134,17 +140,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
else if (to_size <= MAX_INLINE_DATA)
return 0;
- page = grab_cache_page(inode->i_mapping, 0);
- if (!page)
- return -ENOMEM;
+ if (!page || page->index != 0) {
+ new_page = grab_cache_page(inode->i_mapping, 0);
+ if (!new_page)
+ return -ENOMEM;
+ }
- err = __f2fs_convert_inline_data(inode, page);
- f2fs_put_page(page, 1);
+ err = __f2fs_convert_inline_data(inode, new_page);
+ if (!page || page->index != 0)
+ f2fs_put_page(new_page, 1);
return err;
}
int f2fs_write_inline_data(struct inode *inode,
- struct page *page, unsigned size)
+ struct page *page, unsigned size)
{
void *src_addr, *dst_addr;
struct page *ipage;
@@ -181,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
void truncate_inline_data(struct inode *inode, u64 from)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *ipage;
if (from >= MAX_INLINE_DATA)
return;
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return;
@@ -199,9 +207,9 @@ void truncate_inline_data(struct inode *inode, u64 from)
f2fs_put_page(ipage, 1);
}
-int recover_inline_data(struct inode *inode, struct page *npage)
+bool recover_inline_data(struct inode *inode, struct page *npage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
void *src_addr, *dst_addr;
struct page *ipage;
@@ -218,10 +226,10 @@ int recover_inline_data(struct inode *inode, struct page *npage)
ri = F2FS_INODE(npage);
if (f2fs_has_inline_data(inode) &&
- ri && ri->i_inline & F2FS_INLINE_DATA) {
+ ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ f2fs_bug_on(sbi, IS_ERR(ipage));
f2fs_wait_on_page_writeback(ipage, NODE);
@@ -230,22 +238,22 @@ process_inline:
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
- return -1;
+ return true;
}
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ f2fs_bug_on(sbi, IS_ERR(ipage));
f2fs_wait_on_page_writeback(ipage, NODE);
zero_user_segment(ipage, INLINE_DATA_OFFSET,
INLINE_DATA_OFFSET + MAX_INLINE_DATA);
clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
- } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
- truncate_blocks(inode, 0);
+ } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+ truncate_blocks(inode, 0, false);
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
goto process_inline;
}
- return 0;
+ return false;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
static int do_read_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
void update_inode_page(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
retry:
node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
void f2fs_evict_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ /* any remaining atomic pages should be discarded */
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ commit_inmem_pages(inode, true);
+
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
inode->i_ino == F2FS_META_INO(sbi))
goto out_clear;
- f2fs_bug_on(get_dirty_dents(inode));
+ f2fs_bug_on(sbi, get_dirty_pages(inode));
remove_dirty_dir_inode(inode);
if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
out_clear:
clear_inode(inode);
}
+
+/* caller should call f2fs_lock_op() */
+void handle_failed_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ clear_nlink(inode);
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+
+ i_size_write(inode, 0);
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ remove_inode_page(inode);
+ stat_dec_inline_inode(inode);
+
+ alloc_nid_failed(sbi, inode->i_ino);
+ f2fs_unlock_op(sbi);
+
+ /* iput will drop the inode object */
+ iput(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 27b03776ffd2..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
struct inode *inode;
bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
nid_t ino = 0;
int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, ino);
@@ -133,11 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
unlock_new_inode(inode);
return 0;
out:
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, ino);
+ handle_failed_inode(inode);
return err;
}
@@ -145,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
int err;
f2fs_balance_fs(sbi);
@@ -156,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
d_instantiate(dentry, inode);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
iput(inode);
+ f2fs_unlock_op(sbi);
return err;
}
@@ -205,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode = dentry->d_inode;
struct f2fs_dir_entry *de;
struct page *page;
@@ -229,7 +226,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_delete_entry(de, page, inode);
f2fs_unlock_op(sbi);
- /* In order to evict this inode, we set it dirty */
+ /* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
fail:
trace_f2fs_unlink_exit(inode, err);
@@ -239,7 +236,7 @@ fail:
static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
size_t symlen = strlen(symname) + 1;
int err;
@@ -255,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
err = page_symlink(inode, symname, symlen);
alloc_nid_done(sbi, inode->i_ino);
@@ -266,17 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
unlock_new_inode(inode);
return err;
out:
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
@@ -294,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out_fail;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
@@ -307,11 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
out_fail:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
@@ -326,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err = 0;
@@ -344,27 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page;
@@ -488,8 +473,7 @@ out:
static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct super_block *sb = old_dir->i_sb;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page, *new_dir_page;
@@ -650,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
@@ -686,12 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
release_out:
release_orphan_inode(sbi);
out:
- f2fs_unlock_op(sbi);
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
@@ -704,7 +683,6 @@ const struct inode_operations f2fs_dir_inode_operations = {
.mkdir = f2fs_mkdir,
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
- .rename = f2fs_rename,
.rename2 = f2fs_rename2,
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d3d90d284631..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
unsigned int long flags;
if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
spin_unlock_irqrestore(&mapping->tree_lock, flags);
clear_page_dirty_for_io(page);
- dec_page_count(sbi, F2FS_DIRTY_NODES);
+ dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
}
ClearPageUptodate(page);
}
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
/* get current nat block page with lock */
src_page = get_meta_page(sbi, src_off);
dst_page = grab_meta_page(sbi, dst_off);
- f2fs_bug_on(PageDirty(src_page));
+ f2fs_bug_on(sbi, PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
kmem_cache_free(nat_entry_slab, e);
}
-int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
+ struct nat_entry_set *head;
+
+ if (get_nat_flag(ne, IS_DIRTY))
+ return;
+retry:
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (!head) {
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+ INIT_LIST_HEAD(&head->entry_list);
+ INIT_LIST_HEAD(&head->set_list);
+ head->set = set;
+ head->entry_cnt = 0;
+
+ if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
+ cond_resched();
+ goto retry;
+ }
+ }
+ list_move_tail(&ne->list, &head->entry_list);
+ nm_i->dirty_nat_cnt++;
+ head->entry_cnt++;
+ set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+ struct nat_entry_set *head;
+
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (head) {
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ head->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
+ }
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+ nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+ return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+ start, nr);
+}
+
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- int is_cp = 1;
+ bool is_cp = true;
read_lock(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
- if (e && !e->checkpointed)
- is_cp = 0;
+ if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+ is_cp = false;
read_unlock(&nm_i->nat_tree_lock);
return is_cp;
}
-bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool fsync_done = false;
+ bool fsynced = false;
read_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
- if (e)
- fsync_done = e->fsync_done;
+ e = __lookup_nat_cache(nm_i, ino);
+ if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
+ fsynced = true;
read_unlock(&nm_i->nat_tree_lock);
- return fsync_done;
+ return fsynced;
}
-void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid)
+bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
+ bool need_update = true;
- write_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
- if (e)
- e->fsync_done = false;
- write_unlock(&nm_i->nat_tree_lock);
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, ino);
+ if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
+ (get_nat_flag(e, IS_CHECKPOINTED) ||
+ get_nat_flag(e, HAS_FSYNCED_INODE)))
+ need_update = false;
+ read_unlock(&nm_i->nat_tree_lock);
+ return need_update;
}
static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
}
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
- new->checkpointed = true;
+ nat_reset_flag(new);
list_add_tail(&new->list, &nm_i->nat_entries);
nm_i->nat_cnt++;
return new;
@@ -216,7 +270,7 @@ retry:
goto retry;
}
e->ni = *ni;
- f2fs_bug_on(ni->blk_addr == NEW_ADDR);
+ f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
* when nid is reallocated,
@@ -224,20 +278,20 @@ retry:
* So, reinitialize it with new information.
*/
e->ni = *ni;
- f2fs_bug_on(ni->blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
}
/* sanity check */
- f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
- f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
new_blkaddr == NULL_ADDR);
- f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
new_blkaddr == NEW_ADDR);
- f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
nat_get_blkaddr(e) != NULL_ADDR &&
new_blkaddr == NEW_ADDR);
- /* increament version no as node is removed */
+ /* increment version no as node is removed */
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
@@ -245,12 +299,17 @@ retry:
/* change address */
nat_set_blkaddr(e, new_blkaddr);
+ if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
+ set_nat_flag(e, IS_CHECKPOINTED, false);
__set_nat_cache_dirty(nm_i, e);
/* update fsync_mark if its inode nat entry is still alive */
e = __lookup_nat_cache(nm_i, ni->ino);
- if (e)
- e->fsync_done = fsync_done;
+ if (e) {
+ if (fsync_done && ni->nid == ni->ino)
+ set_nat_flag(e, HAS_FSYNCED_INODE, true);
+ set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
+ }
write_unlock(&nm_i->nat_tree_lock);
}
@@ -274,7 +333,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
/*
- * This function returns always success
+ * This function always returns success
*/
void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
@@ -411,7 +470,7 @@ got:
*/
int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct page *npage[4];
struct page *parent;
int offset[4];
@@ -504,15 +563,15 @@ release_out:
static void truncate_node(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info ni;
get_node_info(sbi, dn->nid, &ni);
if (dn->inode->i_blocks == 0) {
- f2fs_bug_on(ni.blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
goto invalidate;
}
- f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+ f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
/* Deallocate node address */
invalidate_blocks(sbi, ni.blk_addr);
@@ -540,14 +599,13 @@ invalidate:
static int truncate_dnode(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct page *page;
if (dn->nid == 0)
return 1;
/* get direct node */
- page = get_node_page(sbi, dn->nid);
+ page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
return 1;
else if (IS_ERR(page))
@@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
int ofs, int depth)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct dnode_of_data rdn = *dn;
struct page *page;
struct f2fs_node *rn;
@@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
- page = get_node_page(sbi, dn->nid);
+ page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
if (IS_ERR(page)) {
trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
return PTR_ERR(page);
@@ -636,7 +693,6 @@ out_err:
static int truncate_partial_nodes(struct dnode_of_data *dn,
struct f2fs_inode *ri, int *offset, int depth)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct page *pages[2];
nid_t nid[3];
nid_t child_nid;
@@ -650,8 +706,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
- /* refernece count'll be increased */
- pages[i] = get_node_page(sbi, nid[i]);
+ /* reference count'll be increased */
+ pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
if (IS_ERR(pages[i])) {
err = PTR_ERR(pages[i]);
idx = i - 1;
@@ -696,7 +752,7 @@ fail:
*/
int truncate_inode_blocks(struct inode *inode, pgoff_t from)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err = 0, cont = 1;
int level, offset[4], noffset[4];
unsigned int nofs = 0;
@@ -792,7 +848,7 @@ fail:
int truncate_xattr_node(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t nid = F2FS_I(inode)->i_xattr_nid;
struct dnode_of_data dn;
struct page *npage;
@@ -823,22 +879,27 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
*/
void remove_inode_page(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct page *page;
- nid_t ino = inode->i_ino;
struct dnode_of_data dn;
- page = get_node_page(sbi, ino);
- if (IS_ERR(page))
+ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+ if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
return;
- if (truncate_xattr_node(inode, page)) {
- f2fs_put_page(page, 1);
+ if (truncate_xattr_node(inode, dn.inode_page)) {
+ f2fs_put_dnode(&dn);
return;
}
- /* 0 is possible, after f2fs_new_inode() is failed */
- f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
- set_new_dnode(&dn, inode, page, page, ino);
+
+ /* remove potential inline_data blocks */
+ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode))
+ truncate_data_blocks_range(&dn, 1);
+
+ /* 0 is possible, after f2fs_new_inode() has failed */
+ f2fs_bug_on(F2FS_I_SB(inode),
+ inode->i_blocks != 0 && inode->i_blocks != 1);
+
+ /* will put inode & node pages */
truncate_node(&dn);
}
@@ -856,7 +917,7 @@ struct page *new_inode_page(struct inode *inode)
struct page *new_node_page(struct dnode_of_data *dn,
unsigned int ofs, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info old_ni, new_ni;
struct page *page;
int err;
@@ -876,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
get_node_info(sbi, dn->nid, &old_ni);
/* Reinitialize old_ni with new node page */
- f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
new_ni = old_ni;
new_ni.ino = dn->inode->i_ino;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -914,7 +975,7 @@ fail:
*/
static int read_node_page(struct page *page, int rw)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
get_node_info(sbi, page->index, &ni);
@@ -990,7 +1051,7 @@ got_it:
*/
struct page *get_node_page_ra(struct page *parent, int start)
{
- struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
struct blk_plug plug;
struct page *page;
int err, i, end;
@@ -1120,17 +1181,24 @@ continue_unlock:
/* called by fsync() */
if (ino && IS_DNODE(page)) {
- int mark = !is_checkpointed_node(sbi, ino);
set_fsync_mark(page, 1);
- if (IS_INODE(page))
- set_dentry_mark(page, mark);
+ if (IS_INODE(page)) {
+ if (!is_checkpointed_node(sbi, ino) &&
+ !has_fsynced_inode(sbi, ino))
+ set_dentry_mark(page, 1);
+ else
+ set_dentry_mark(page, 0);
+ }
nwritten++;
} else {
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
}
- NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
- wrote++;
+
+ if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+ unlock_page(page);
+ else
+ wrote++;
if (--wbc->nr_to_write == 0)
break;
@@ -1199,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
static int f2fs_write_node_page(struct page *page,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
nid_t nid;
block_t new_addr;
struct node_info ni;
@@ -1212,12 +1280,14 @@ static int f2fs_write_node_page(struct page *page,
if (unlikely(sbi->por_doing))
goto redirty_out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
f2fs_wait_on_page_writeback(page, NODE);
/* get old block addr of this node page */
nid = nid_of_node(page);
- f2fs_bug_on(page->index != nid);
+ f2fs_bug_on(sbi, page->index != nid);
get_node_info(sbi, nid, &ni);
@@ -1248,7 +1318,7 @@ redirty_out:
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff;
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1273,15 +1343,12 @@ skip_write:
static int f2fs_set_node_page_dirty(struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
trace_f2fs_set_page_dirty(page, NODE);
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- inc_page_count(sbi, F2FS_DIRTY_NODES);
+ inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
return 1;
}
@@ -1292,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
if (PageDirty(page))
- dec_page_count(sbi, F2FS_DIRTY_NODES);
+ dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
ClearPagePrivate(page);
}
@@ -1347,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
read_lock(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (ne &&
- (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
+ (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
read_unlock(&nm_i->nat_tree_lock);
if (allocated)
@@ -1404,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
break;
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
- f2fs_bug_on(blk_addr == NEW_ADDR);
+ f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
if (blk_addr == NULL_ADDR) {
if (add_free_nid(sbi, start_nid, true) < 0)
break;
@@ -1474,12 +1541,12 @@ retry:
/* We should not use stale free nids created by build_free_nids */
if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- f2fs_bug_on(list_empty(&nm_i->free_nid_list));
+ f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
list_for_each_entry(i, &nm_i->free_nid_list, list)
if (i->state == NID_NEW)
break;
- f2fs_bug_on(i->state != NID_NEW);
+ f2fs_bug_on(sbi, i->state != NID_NEW);
*nid = i->nid;
i->state = NID_ALLOC;
nm_i->fcnt--;
@@ -1505,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
spin_lock(&nm_i->free_nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(!i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
__del_from_free_nid_list(nm_i, i);
spin_unlock(&nm_i->free_nid_list_lock);
@@ -1526,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
spin_lock(&nm_i->free_nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(!i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
if (!available_free_memory(sbi, FREE_NIDS)) {
__del_from_free_nid_list(nm_i, i);
need_free = true;
@@ -1540,35 +1607,21 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
kmem_cache_free(free_nid_slab, i);
}
-void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_summary *sum, struct node_info *ni,
- block_t new_blkaddr)
-{
- rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
- set_node_addr(sbi, ni, new_blkaddr, false);
- clear_node_page_dirty(page);
-}
-
void recover_inline_xattr(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
void *src_addr, *dst_addr;
size_t inline_size;
struct page *ipage;
struct f2fs_inode *ri;
- if (!f2fs_has_inline_xattr(inode))
- return;
-
- if (!IS_INODE(page))
- return;
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
ri = F2FS_INODE(page);
- if (!(ri->i_inline & F2FS_INLINE_XATTR))
- return;
-
- ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+ goto update_inode;
+ }
dst_addr = inline_xattr_addr(ipage);
src_addr = inline_xattr_addr(page);
@@ -1576,28 +1629,25 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
f2fs_wait_on_page_writeback(ipage, NODE);
memcpy(dst_addr, src_addr, inline_size);
-
+update_inode:
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
}
-bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
nid_t new_xnid = nid_of_node(page);
struct node_info ni;
- if (!f2fs_has_xattr_block(ofs_of_node(page)))
- return false;
-
/* 1: invalidate the previous xattr nid */
if (!prev_xnid)
goto recover_xnid;
/* Deallocate node address */
get_node_info(sbi, prev_xnid, &ni);
- f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+ f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
invalidate_blocks(sbi, ni.blk_addr);
dec_valid_node_count(sbi, inode);
set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1605,7 +1655,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
recover_xnid:
/* 2: allocate new xattr nid */
if (unlikely(!inc_valid_node_count(sbi, inode)))
- f2fs_bug_on(1);
+ f2fs_bug_on(sbi, 1);
remove_free_nid(NM_I(sbi), new_xnid);
get_node_info(sbi, new_xnid, &ni);
@@ -1618,7 +1668,6 @@ recover_xnid:
set_node_addr(sbi, &ni, blkaddr, false);
update_inode_page(inode);
- return true;
}
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1637,7 +1686,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (!ipage)
return -ENOMEM;
- /* Should not use this inode from free nid list */
+ /* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
SetPageUptodate(ipage);
@@ -1651,6 +1700,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
dst->i_xattr_nid = 0;
+ dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
new_ni = old_ni;
new_ni.ino = ino;
@@ -1659,13 +1709,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return 0;
}
/*
* ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-readed pages are alloced in bd_inode's mapping tree.
+ * these pre-read pages are allocated in bd_inode's mapping tree.
*/
static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
int start, int nrpages)
@@ -1697,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum_entry;
struct inode *inode = sbi->sb->s_bdev->bd_inode;
block_t addr;
- int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int bio_blocks = MAX_BIO_BLOCKS(sbi);
struct page *pages[bio_blocks];
int i, idx, last_offset, nrpages, err = 0;
@@ -1709,7 +1760,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
nrpages = min(last_offset - i, bio_blocks);
- /* read ahead node pages */
+ /* readahead node pages */
nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
if (!nrpages)
return -ENOMEM;
@@ -1739,89 +1790,6 @@ skip:
return err;
}
-static struct nat_entry_set *grab_nat_entry_set(void)
-{
- struct nat_entry_set *nes =
- f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
-
- nes->entry_cnt = 0;
- INIT_LIST_HEAD(&nes->set_list);
- INIT_LIST_HEAD(&nes->entry_list);
- return nes;
-}
-
-static void release_nat_entry_set(struct nat_entry_set *nes,
- struct f2fs_nm_info *nm_i)
-{
- f2fs_bug_on(!list_empty(&nes->entry_list));
-
- nm_i->dirty_nat_cnt -= nes->entry_cnt;
- list_del(&nes->set_list);
- kmem_cache_free(nat_entry_set_slab, nes);
-}
-
-static void adjust_nat_entry_set(struct nat_entry_set *nes,
- struct list_head *head)
-{
- struct nat_entry_set *next = nes;
-
- if (list_is_last(&nes->set_list, head))
- return;
-
- list_for_each_entry_continue(next, head, set_list)
- if (nes->entry_cnt <= next->entry_cnt)
- break;
-
- list_move_tail(&nes->set_list, &next->set_list);
-}
-
-static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
-{
- struct nat_entry_set *nes;
- nid_t start_nid = START_NID(ne->ni.nid);
-
- list_for_each_entry(nes, head, set_list) {
- if (nes->start_nid == start_nid) {
- list_move_tail(&ne->list, &nes->entry_list);
- nes->entry_cnt++;
- adjust_nat_entry_set(nes, head);
- return;
- }
- }
-
- nes = grab_nat_entry_set();
-
- nes->start_nid = start_nid;
- list_move_tail(&ne->list, &nes->entry_list);
- nes->entry_cnt++;
- list_add(&nes->set_list, head);
-}
-
-static void merge_nats_in_set(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct list_head *dirty_list = &nm_i->dirty_nat_entries;
- struct list_head *set_list = &nm_i->nat_entry_set;
- struct nat_entry *ne, *tmp;
-
- write_lock(&nm_i->nat_tree_lock);
- list_for_each_entry_safe(ne, tmp, dirty_list, list) {
- if (nat_get_blkaddr(ne) == NEW_ADDR)
- continue;
- add_nat_entry(ne, set_list);
- nm_i->dirty_nat_cnt++;
- }
- write_unlock(&nm_i->nat_tree_lock);
-}
-
-static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
-{
- if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
- return true;
- else
- return false;
-}
-
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1856,99 +1824,130 @@ found:
mutex_unlock(&curseg->curseg_mutex);
}
-/*
- * This function is called during the checkpointing process.
- */
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+static void __adjust_nat_entry_set(struct nat_entry_set *nes,
+ struct list_head *head, int max)
{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
- struct nat_entry_set *nes, *tmp;
- struct list_head *head = &nm_i->nat_entry_set;
- bool to_journal = true;
+ struct nat_entry_set *cur;
- /* merge nat entries of dirty list to nat entry set temporarily */
- merge_nats_in_set(sbi);
+ if (nes->entry_cnt >= max)
+ goto add_out;
- /*
- * if there are no enough space in journal to store dirty nat
- * entries, remove all entries from journal and merge them
- * into nat entry set.
- */
- if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
- remove_nats_in_journal(sbi);
-
- /*
- * merge nat entries of dirty list to nat entry set temporarily
- */
- merge_nats_in_set(sbi);
+ list_for_each_entry(cur, head, set_list) {
+ if (cur->entry_cnt >= nes->entry_cnt) {
+ list_add(&nes->set_list, cur->set_list.prev);
+ return;
+ }
}
+add_out:
+ list_add_tail(&nes->set_list, head);
+}
- if (!nm_i->dirty_nat_cnt)
- return;
+static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+ struct nat_entry_set *set)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
+ bool to_journal = true;
+ struct f2fs_nat_block *nat_blk;
+ struct nat_entry *ne, *cur;
+ struct page *page = NULL;
/*
* there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- list_for_each_entry_safe(nes, tmp, head, set_list) {
- struct f2fs_nat_block *nat_blk;
- struct nat_entry *ne, *cur;
- struct page *page;
- nid_t start_nid = nes->start_nid;
+ if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+ to_journal = false;
- if (to_journal && !__has_cursum_space(sum, nes->entry_cnt))
- to_journal = false;
+ if (to_journal) {
+ mutex_lock(&curseg->curseg_mutex);
+ } else {
+ page = get_next_nat_page(sbi, start_nid);
+ nat_blk = page_address(page);
+ f2fs_bug_on(sbi, !nat_blk);
+ }
+
+ /* flush dirty nats in nat entry set */
+ list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
+ struct f2fs_nat_entry *raw_ne;
+ nid_t nid = nat_get_nid(ne);
+ int offset;
+
+ if (nat_get_blkaddr(ne) == NEW_ADDR)
+ continue;
if (to_journal) {
- mutex_lock(&curseg->curseg_mutex);
+ offset = lookup_journal_in_cursum(sum,
+ NAT_JOURNAL, nid, 1);
+ f2fs_bug_on(sbi, offset < 0);
+ raw_ne = &nat_in_journal(sum, offset);
+ nid_in_journal(sum, offset) = cpu_to_le32(nid);
} else {
- page = get_next_nat_page(sbi, start_nid);
- nat_blk = page_address(page);
- f2fs_bug_on(!nat_blk);
+ raw_ne = &nat_blk->entries[nid - start_nid];
}
+ raw_nat_from_node_info(raw_ne, &ne->ni);
- /* flush dirty nats in nat entry set */
- list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
- struct f2fs_nat_entry *raw_ne;
- nid_t nid = nat_get_nid(ne);
- int offset;
+ write_lock(&NM_I(sbi)->nat_tree_lock);
+ nat_reset_flag(ne);
+ __clear_nat_cache_dirty(NM_I(sbi), ne);
+ write_unlock(&NM_I(sbi)->nat_tree_lock);
- if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
- NAT_JOURNAL, nid, 1);
- f2fs_bug_on(offset < 0);
- raw_ne = &nat_in_journal(sum, offset);
- nid_in_journal(sum, offset) = cpu_to_le32(nid);
- } else {
- raw_ne = &nat_blk->entries[nid - start_nid];
- }
- raw_nat_from_node_info(raw_ne, &ne->ni);
+ if (nat_get_blkaddr(ne) == NULL_ADDR)
+ add_free_nid(sbi, nid, false);
+ }
- if (nat_get_blkaddr(ne) == NULL_ADDR &&
- add_free_nid(sbi, nid, false) <= 0) {
- write_lock(&nm_i->nat_tree_lock);
- __del_from_nat_cache(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
- } else {
- write_lock(&nm_i->nat_tree_lock);
- __clear_nat_cache_dirty(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
- }
- }
+ if (to_journal)
+ mutex_unlock(&curseg->curseg_mutex);
+ else
+ f2fs_put_page(page, 1);
- if (to_journal)
- mutex_unlock(&curseg->curseg_mutex);
- else
- f2fs_put_page(page, 1);
+ if (!set->entry_cnt) {
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
+ }
+}
- release_nat_entry_set(nes, nm_i);
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct nat_entry_set *setvec[NATVEC_SIZE];
+ struct nat_entry_set *set, *tmp;
+ unsigned int found;
+ nid_t set_idx = 0;
+ LIST_HEAD(sets);
+
+ /*
+ * if there are no enough space in journal to store dirty nat
+ * entries, remove all entries from journal and merge them
+ * into nat entry set.
+ */
+ if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ remove_nats_in_journal(sbi);
+
+ if (!nm_i->dirty_nat_cnt)
+ return;
+
+ while ((found = __gang_lookup_nat_set(nm_i,
+ set_idx, NATVEC_SIZE, setvec))) {
+ unsigned idx;
+ set_idx = setvec[found - 1]->set + 1;
+ for (idx = 0; idx < found; idx++)
+ __adjust_nat_entry_set(setvec[idx], &sets,
+ MAX_NAT_JENTRIES(sum));
}
- f2fs_bug_on(!list_empty(head));
- f2fs_bug_on(nm_i->dirty_nat_cnt);
+ /* flush dirty nats in nat entry set */
+ list_for_each_entry_safe(set, tmp, &sets, set_list)
+ __flush_nat_entry_set(sbi, set);
+
+ f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1967,7 +1966,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - 3;
+ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
@@ -1975,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
- INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
- INIT_LIST_HEAD(&nm_i->nat_entry_set);
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
@@ -2026,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
/* destroy free nid list */
spin_lock(&nm_i->free_nid_list_lock);
list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- f2fs_bug_on(i->state == NID_ALLOC);
+ f2fs_bug_on(sbi, i->state == NID_ALLOC);
__del_from_free_nid_list(nm_i, i);
nm_i->fcnt--;
spin_unlock(&nm_i->free_nid_list_lock);
kmem_cache_free(free_nid_slab, i);
spin_lock(&nm_i->free_nid_list_lock);
}
- f2fs_bug_on(nm_i->fcnt);
+ f2fs_bug_on(sbi, nm_i->fcnt);
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
@@ -2045,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
for (idx = 0; idx < found; idx++)
__del_from_nat_cache(nm_i, natvec[idx]);
}
- f2fs_bug_on(nm_i->nat_cnt);
+ f2fs_bug_on(sbi, nm_i->nat_cnt);
write_unlock(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8a116a407599..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
unsigned char version; /* version of the node */
};
+/*
+ * Bit positions inside nat_entry.flag. set_nat_flag() and get_nat_flag()
+ * turn each value into its mask with (0x01 << value).
+ */
+enum {
+ IS_CHECKPOINTED, /* is it checkpointed before? */
+ HAS_FSYNCED_INODE, /* is the inode fsynced before? */
+ HAS_LAST_FSYNC, /* has the latest node fsync mark? */
+ IS_DIRTY, /* this nat entry is dirty? */
+};
+
struct nat_entry {
struct list_head list; /* for clean or dirty nat list */
- bool checkpointed; /* whether it is checkpointed or not */
- bool fsync_done; /* whether the latest node has fsync mark */
+ unsigned char flag; /* for node information bits */
struct node_info ni; /* in-memory node information */
};
@@ -55,18 +61,32 @@ struct nat_entry {
#define nat_get_version(nat) (nat->ni.version)
#define nat_set_version(nat, v) (nat->ni.version = v)
-#define __set_nat_cache_dirty(nm_i, ne) \
- do { \
- ne->checkpointed = false; \
- list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
- } while (0)
-#define __clear_nat_cache_dirty(nm_i, ne) \
- do { \
- ne->checkpointed = true; \
- list_move_tail(&ne->list, &nm_i->nat_entries); \
- } while (0)
#define inc_node_version(version) (++version)
+/*
+ * Set (@set == true) or clear (@set == false) a single bit of ne->flag.
+ * @type is the bit position; the mask is held in an unsigned char, so
+ * only positions 0..7 have any effect.
+ */
+static inline void set_nat_flag(struct nat_entry *ne,
+ unsigned int type, bool set)
+{
+ unsigned char mask = 0x01 << type;
+ if (set)
+ ne->flag |= mask;
+ else
+ ne->flag &= ~mask;
+}
+
+/*
+ * Test a single bit of ne->flag. @type is the bit position; the masked
+ * value is converted to bool on return, so the result is true exactly
+ * when that bit is set.
+ */
+static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
+{
+ unsigned char mask = 0x01 << type;
+ return ne->flag & mask;
+}
+
+/*
+ * Put the checkpoint/fsync bits of ne->flag into a fixed state:
+ * IS_CHECKPOINTED and HAS_LAST_FSYNC are set, HAS_FSYNCED_INODE is
+ * cleared. IS_DIRTY is not touched here.
+ */
+static inline void nat_reset_flag(struct nat_entry *ne)
+{
+ /* these states can be set only after checkpoint was done */
+ set_nat_flag(ne, IS_CHECKPOINTED, true);
+ set_nat_flag(ne, HAS_FSYNCED_INODE, false);
+ set_nat_flag(ne, HAS_LAST_FSYNC, true);
+}
+
static inline void node_info_from_raw_nat(struct node_info *ni,
struct f2fs_nat_entry *raw_ne)
{
@@ -90,9 +110,9 @@ enum mem_type {
};
struct nat_entry_set {
- struct list_head set_list; /* link with all nat sets */
+ struct list_head set_list; /* link with other nat sets */
struct list_head entry_list; /* link with dirty nat entries */
- nid_t start_nid; /* start nid of nats in set */
+ nid_t set; /* set number*/
unsigned int entry_cnt; /* the # of nat entries in set */
};
@@ -110,18 +130,19 @@ struct free_nid {
int state; /* in use or not: NID_NEW or NID_ALLOC */
};
-static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *fnid;
- if (nm_i->fcnt <= 0)
- return -1;
spin_lock(&nm_i->free_nid_list_lock);
+ if (nm_i->fcnt <= 0) {
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return;
+ }
fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
*nid = fnid->nid;
spin_unlock(&nm_i->free_nid_list_lock);
- return 0;
}
/*
@@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
struct f2fs_node *rn = F2FS_NODE(page);
rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fe1c6d921ba2..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
#include "node.h"
#include "segment.h"
+/*
+ * Roll forward recovery scenarios.
+ *
+ * [Term] F: fsync_mark, D: dentry_mark
+ *
+ * 1. inode(x) | CP | inode(x) | dnode(F)
+ * -> Update the latest inode(x).
+ *
+ * 2. inode(x) | CP | inode(F) | dnode(F)
+ * -> No problem.
+ *
+ * 3. inode(x) | CP | dnode(F) | inode(x)
+ * -> Recover to the latest dnode(F), and drop the last inode(x)
+ *
+ * 4. inode(x) | CP | dnode(F) | inode(F)
+ * -> No problem.
+ *
+ * 5. CP | inode(x) | dnode(F)
+ * -> The inode(DF) was missing. Should drop this dnode(F).
+ *
+ * 6. CP | inode(DF) | dnode(F)
+ * -> No problem.
+ *
+ * 7. CP | dnode(F) | inode(DF)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ *
+ * 8. CP | dnode(F) | inode(x)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ * But it will fail due to no inode(DF).
+ */
+
static struct kmem_cache *fsync_entry_slab;
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
return NULL;
}
-static int recover_dentry(struct page *ipage, struct inode *inode)
+static int recover_dentry(struct inode *inode, struct page *ipage)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -62,8 +93,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
}
retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de && inode->i_ino == le32_to_cpu(de->ino))
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
goto out_unmap_put;
+ }
if (de) {
einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
@@ -73,7 +106,7 @@ retry:
err = -EEXIST;
goto out_unmap_put;
}
- err = acquire_orphan_inode(F2FS_SB(inode->i_sb));
+ err = acquire_orphan_inode(F2FS_I_SB(inode));
if (err) {
iput(einode);
goto out_unmap_put;
@@ -108,35 +141,28 @@ out:
return err;
}
-static int recover_inode(struct inode *inode, struct page *node_page)
+static void recover_inode(struct inode *inode, struct page *page)
{
- struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
+ struct f2fs_inode *raw = F2FS_INODE(page);
- if (!IS_INODE(node_page))
- return 0;
-
- inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- i_size_write(inode, le64_to_cpu(raw_inode->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
- inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
-
- if (is_dent_dnode(node_page))
- return recover_dentry(node_page, inode);
+ inode->i_mode = le16_to_cpu(raw->i_mode);
+ i_size_write(inode, le64_to_cpu(raw->i_size));
+ inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+ inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
- ino_of_node(node_page), raw_inode->i_name);
- return 0;
+ ino_of_node(page), F2FS_INODE(page)->i_name);
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct page *page;
+ struct page *page = NULL;
block_t blkaddr;
int err = 0;
@@ -144,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- /* read node page */
- page = alloc_page(GFP_F2FS_ZERO);
- if (!page)
- return -ENOMEM;
- lock_page(page);
-
while (1) {
struct fsync_inode_entry *entry;
- err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ return 0;
- lock_page(page);
+ page = get_meta_page_ra(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
@@ -178,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
}
/* add this fsync inode to the list */
- entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
+ entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
if (!entry) {
err = -ENOMEM;
break;
}
-
+ /*
+ * CP | dnode(F) | inode(DF)
+ * For this case, we should not give up now.
+ */
entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
+ if (err == -ENOENT)
+ goto next;
break;
}
list_add_tail(&entry->list, head);
}
entry->blkaddr = blkaddr;
- err = recover_inode(entry->inode, page);
- if (err && err != -ENOENT)
- break;
+ if (IS_INODE(page)) {
+ entry->last_inode = blkaddr;
+ if (is_dent_dnode(page))
+ entry->last_dentry = blkaddr;
+ }
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
+ f2fs_put_page(page, 1);
}
-
- unlock_page(page);
- __free_pages(page, 0);
-
+ f2fs_put_page(page, 1);
return err;
}
@@ -277,16 +301,30 @@ got_it:
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
- /* Deallocate previous index in the node page */
- inode = f2fs_iget(sbi->sb, ino);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ if (ino != dn->inode->i_ino) {
+ /* Deallocate previous index in the node page */
+ inode = f2fs_iget(sbi->sb, ino);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ } else {
+ inode = dn->inode;
+ }
bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
- le16_to_cpu(sum.ofs_in_node);
+ le16_to_cpu(sum.ofs_in_node);
- truncate_hole(inode, bidx, bidx + 1);
- iput(inode);
+ if (ino != dn->inode->i_ino) {
+ truncate_hole(inode, bidx, bidx + 1);
+ iput(inode);
+ } else {
+ struct dnode_of_data tdn;
+ set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
+ if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+ return 0;
+ if (tdn.data_blkaddr != NULL_ADDR)
+ truncate_data_blocks_range(&tdn, 1);
+ f2fs_put_page(tdn.node_page, 1);
+ }
return 0;
}
@@ -300,14 +338,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct node_info ni;
int err = 0, recovered = 0;
- recover_inline_xattr(inode, page);
-
- if (recover_inline_data(inode, page))
+ /* step 1: recover xattr */
+ if (IS_INODE(page)) {
+ recover_inline_xattr(inode, page);
+ } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+ recover_xattr_data(inode, page, blkaddr);
goto out;
+ }
- if (recover_xattr_data(inode, page, blkaddr))
+ /* step 2: recover inline data */
+ if (recover_inline_data(inode, page))
goto out;
+ /* step 3: recover data indices */
start = start_bidx_of_node(ofs_of_node(page), fi);
end = start + ADDRS_PER_PAGE(page, fi);
@@ -324,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
f2fs_wait_on_page_writeback(dn.node_page, NODE);
get_node_info(sbi, dn.nid, &ni);
- f2fs_bug_on(ni.ino != ino_of_node(page));
- f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page));
+ f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
+ f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
for (; start < end; start++) {
block_t src, dest;
@@ -337,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
/* We should not get -ENOSPC */
- f2fs_bug_on(err);
+ f2fs_bug_on(sbi, err);
}
/* Check the previous node page having this index */
@@ -364,8 +407,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
set_page_dirty(dn.node_page);
-
- recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
err:
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
@@ -381,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct page *page;
+ struct page *page = NULL;
int err = 0;
block_t blkaddr;
@@ -389,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
curseg = CURSEG_I(sbi, type);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- /* read node page */
- page = alloc_page(GFP_F2FS_ZERO);
- if (!page)
- return -ENOMEM;
-
- lock_page(page);
-
while (1) {
struct fsync_inode_entry *entry;
- err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ break;
- lock_page(page);
+ page = get_meta_page_ra(sbi, blkaddr);
- if (cp_ver != cpver_of_node(page))
+ if (cp_ver != cpver_of_node(page)) {
+ f2fs_put_page(page, 1);
break;
+ }
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
goto next;
-
+ /*
+ * inode(x) | CP | inode(x) | dnode(F)
+ * In this case, we can lose the latest inode(x).
+ * So, call recover_inode for the inode update.
+ */
+ if (entry->last_inode == blkaddr)
+ recover_inode(entry->inode, page);
+ if (entry->last_dentry == blkaddr) {
+ err = recover_dentry(entry->inode, page);
+ if (err) {
+ f2fs_put_page(page, 1);
+ break;
+ }
+ }
err = do_recover_data(sbi, entry->inode, page, blkaddr);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
break;
+ }
if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -424,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
+ f2fs_put_page(page, 1);
}
-
- unlock_page(page);
- __free_pages(page, 0);
-
if (!err)
allocate_new_segments(sbi);
return err;
@@ -452,6 +499,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
/* step #1: find fsynced inode numbers */
sbi->por_doing = true;
+ /* prevent checkpoint */
+ mutex_lock(&sbi->cp_mutex);
+
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
err = find_fsync_dnodes(sbi, &inode_list);
@@ -465,11 +515,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
/* step #2: recover data */
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- f2fs_bug_on(!list_empty(&inode_list));
+ if (!err)
+ f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
+ /* truncate meta pages to be used by the recovery */
+ truncate_inode_pages_range(META_MAPPING(sbi),
+ MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
@@ -482,8 +537,16 @@ out:
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
sync_meta_pages(sbi, META, LONG_MAX);
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
- write_checkpoint(sbi, false);
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
+ mutex_unlock(&sbi->cp_mutex);
+ write_checkpoint(sbi, &cpc);
+ } else {
+ mutex_unlock(&sbi->cp_mutex);
}
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0dfeebae2a50..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
#define __reverse_ffz(x) __reverse_ffs(~(x))
static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *sit_entry_set_slab;
+static struct kmem_cache *inmem_entry_slab;
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -62,7 +64,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
}
/*
- * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue
+ * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
* Example:
* LSB <--> MSB
@@ -172,6 +174,60 @@ found_middle:
return result + __reverse_ffz(tmp);
}
+/*
+ * Append @page to the tail of the inode's fi->inmem_pages list.
+ * A tracking entry is allocated from inmem_entry_slab, an extra page
+ * reference is taken with get_page(), and the list is modified under
+ * fi->inmem_lock. commit_inmem_pages() is the function in this file that
+ * later walks the list, releases each page and frees the entry.
+ */
+void register_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct inmem_pages *new;
+
+ /*
+ * NOTE(review): the result is dereferenced without a NULL check;
+ * presumably f2fs_kmem_cache_alloc() cannot return NULL - confirm.
+ */
+ new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
+
+ /* add atomic page indices to the list */
+ new->page = page;
+ INIT_LIST_HEAD(&new->list);
+
+ /* increase reference count with clean state */
+ mutex_lock(&fi->inmem_lock);
+ get_page(page);
+ list_add_tail(&new->list, &fi->inmem_pages);
+ mutex_unlock(&fi->inmem_lock);
+}
+
+/*
+ * Drain the inode's fi->inmem_pages list.
+ *
+ * With @abort == false, every listed page that still belongs to
+ * inode->i_mapping is written through do_write_data_page() using a
+ * DATA / WRITE_SYNC f2fs_io_info. With @abort == true nothing is
+ * written. In both cases each page is released with f2fs_put_page() and
+ * its list entry is unlinked and freed.
+ *
+ * The whole walk runs between f2fs_lock_op()/f2fs_unlock_op() and under
+ * fi->inmem_lock. The function returns void, so no write status is
+ * reported to the caller.
+ */
+void commit_inmem_pages(struct inode *inode, bool abort)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct inmem_pages *cur, *tmp;
+ bool submit_bio = false;
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = WRITE_SYNC,
+ };
+
+ f2fs_balance_fs(sbi);
+ f2fs_lock_op(sbi);
+
+ mutex_lock(&fi->inmem_lock);
+ list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+ lock_page(cur->page);
+ /* skip the write when aborting or when the page left this mapping */
+ if (!abort && cur->page->mapping == inode->i_mapping) {
+ f2fs_wait_on_page_writeback(cur->page, DATA);
+ if (clear_page_dirty_for_io(cur->page))
+ inode_dec_dirty_pages(inode);
+ /* NOTE(review): any result of do_write_data_page() is not used here */
+ do_write_data_page(cur->page, &fio);
+ submit_bio = true;
+ }
+ /* presumably unlocks the page and drops the get_page() reference - confirm */
+ f2fs_put_page(cur->page, 1);
+ list_del(&cur->list);
+ kmem_cache_free(inmem_entry_slab, cur);
+ }
+ /* submit the merged DATA bio once, only if at least one page was written */
+ if (submit_bio)
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ mutex_unlock(&fi->inmem_lock);
+
+ /* wait over the full file range before releasing the op lock */
+ filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
+ f2fs_unlock_op(sbi);
+}
+
/*
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
@@ -205,24 +261,20 @@ repeat:
if (kthread_should_stop())
return 0;
- spin_lock(&fcc->issue_lock);
- if (fcc->issue_list) {
- fcc->dispatch_list = fcc->issue_list;
- fcc->issue_list = fcc->issue_tail = NULL;
- }
- spin_unlock(&fcc->issue_lock);
-
- if (fcc->dispatch_list) {
+ if (!llist_empty(&fcc->issue_list)) {
struct bio *bio = bio_alloc(GFP_NOIO, 0);
struct flush_cmd *cmd, *next;
int ret;
+ fcc->dispatch_list = llist_del_all(&fcc->issue_list);
+ fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
+
bio->bi_bdev = sbi->sb->s_bdev;
ret = submit_bio_wait(WRITE_FLUSH, bio);
- for (cmd = fcc->dispatch_list; cmd; cmd = next) {
+ llist_for_each_entry_safe(cmd, next,
+ fcc->dispatch_list, llnode) {
cmd->ret = ret;
- next = cmd->next;
complete(&cmd->wait);
}
bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
}
wait_event_interruptible(*q,
- kthread_should_stop() || fcc->issue_list);
+ kthread_should_stop() || !llist_empty(&fcc->issue_list));
goto repeat;
}
@@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
init_completion(&cmd.wait);
- cmd.next = NULL;
- spin_lock(&fcc->issue_lock);
- if (fcc->issue_list)
- fcc->issue_tail->next = &cmd;
- else
- fcc->issue_list = &cmd;
- fcc->issue_tail = &cmd;
- spin_unlock(&fcc->issue_lock);
+ llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
@@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
- spin_lock_init(&fcc->issue_lock);
init_waitqueue_head(&fcc->flush_wait_queue);
+ init_llist_head(&fcc->issue_list);
SM_I(sbi)->cmd_control_info = fcc;
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
@@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
struct seg_entry *sentry = get_seg_entry(sbi, segno);
enum dirty_type t = sentry->type;
+ if (unlikely(t >= DIRTY)) {
+ f2fs_bug_on(sbi, 1);
+ return;
+ }
if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]++;
}
@@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
block_t blkstart, block_t blklen)
{
- sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
- sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
+ sector_t start = SECTOR_FROM_BLOCK(blkstart);
+ sector_t len = SECTOR_FROM_BLOCK(blklen);
trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
}
@@ -392,21 +441,47 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
}
}
-static void add_discard_addrs(struct f2fs_sb_info *sbi,
- unsigned int segno, struct seg_entry *se)
+static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct list_head *head = &SM_I(sbi)->discard_list;
struct discard_entry *new;
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
int max_blocks = sbi->blocks_per_seg;
+ struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
unsigned long dmap[entries];
unsigned int start = 0, end = -1;
+ bool force = (cpc->reason == CP_DISCARD);
int i;
- if (!test_opt(sbi, DISCARD))
+ if (!force && !test_opt(sbi, DISCARD))
+ return;
+
+ if (force && !se->valid_blocks) {
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ /*
+ * if this segment is registered in the prefree list, then
+ * we should skip adding a discard candidate, and let the
+ * checkpoint do that later.
+ */
+ mutex_lock(&dirty_i->seglist_lock);
+ if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
+ mutex_unlock(&dirty_i->seglist_lock);
+ cpc->trimmed += sbi->blocks_per_seg;
+ return;
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+ INIT_LIST_HEAD(&new->list);
+ new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
+ new->len = sbi->blocks_per_seg;
+ list_add_tail(&new->list, head);
+ SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
+ cpc->trimmed += sbi->blocks_per_seg;
return;
+ }
/* zero block will be discarded through the prefree list */
if (!se->valid_blocks || se->valid_blocks == max_blocks)
@@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
for (i = 0; i < entries; i++)
dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
- while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+ while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
start = __find_rev_next_bit(dmap, max_blocks, end + 1);
if (start >= max_blocks)
break;
end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+ if (end - start < cpc->trim_minlen)
+ continue;
+
new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, segno) + start;
+ new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
new->len = end - start;
+ cpc->trimmed += end - start;
list_add_tail(&new->list, head);
SM_I(sbi)->nr_discards += end - start;
}
}
+/*
+ * Unlink and free every discard_entry on SM_I(sbi)->discard_list.
+ * No discard command is issued here, and SM_I(sbi)->nr_discards is left
+ * unchanged - NOTE(review): confirm callers do not rely on it being
+ * reset.
+ */
+void release_discard_addrs(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct discard_entry *entry, *this;
+
+ /* drop caches */
+ list_for_each_entry_safe(entry, this, head, list) {
+ list_del(&entry->list);
+ kmem_cache_free(discard_entry_slab, entry);
+ }
+}
+
/*
* Should call clear_prefree_segments after checkpoint is done.
*/
@@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int segno;
- unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
- for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs)
+ for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
__set_test_and_free(sbi, segno);
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
- unsigned int total_segs = TOTAL_SEGS(sbi);
unsigned int start = 0, end = -1;
mutex_lock(&dirty_i->seglist_lock);
while (1) {
int i;
- start = find_next_bit(prefree_map, total_segs, end + 1);
- if (start >= total_segs)
+ start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
+ if (start >= MAIN_SEGS(sbi))
break;
- end = find_next_zero_bit(prefree_map, total_segs, start + 1);
+ end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
+ start + 1);
for (i = start; i < end; i++)
clear_bit(i, prefree_map);
@@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
}
}
-static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
+
+ if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
sit_i->dirty_sentries++;
+ return false;
+ }
+
+ return true;
}
static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
new_vblocks = se->valid_blocks + del;
offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
- f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
+ f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
(new_vblocks > sbi->blocks_per_seg)));
se->valid_blocks = new_vblocks;
@@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* Update valid block bitmap */
if (del > 0) {
if (f2fs_set_bit(offset, se->cur_valid_map))
- BUG();
+ f2fs_bug_on(sbi, 1);
} else {
if (!f2fs_clear_bit(offset, se->cur_valid_map))
- BUG();
+ f2fs_bug_on(sbi, 1);
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
se->ckpt_valid_blocks += del;
@@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
unsigned int segno = GET_SEGNO(sbi, addr);
struct sit_info *sit_i = SIT_I(sbi);
- f2fs_bug_on(addr == NULL_ADDR);
+ f2fs_bug_on(sbi, addr == NULL_ADDR);
if (addr == NEW_ADDR)
return;
@@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+ if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
- unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone;
+ unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = *newseg / sbi->segs_per_sec;
unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
unsigned int left_start = hint;
@@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- TOTAL_SEGS(sbi), *newseg + 1);
+ MAIN_SEGS(sbi), *newseg + 1);
if (segno - *newseg < sbi->segs_per_sec -
(*newseg % sbi->segs_per_sec))
goto got_it;
}
find_other_zone:
- secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint);
- if (secno >= TOTAL_SECS(sbi)) {
+ secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
+ if (secno >= MAIN_SECS(sbi)) {
if (dir == ALLOC_RIGHT) {
secno = find_next_zero_bit(free_i->free_secmap,
- TOTAL_SECS(sbi), 0);
- f2fs_bug_on(secno >= TOTAL_SECS(sbi));
+ MAIN_SECS(sbi), 0);
+ f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
} else {
go_left = 1;
left_start = hint - 1;
@@ -686,8 +781,8 @@ find_other_zone:
continue;
}
left_start = find_next_zero_bit(free_i->free_secmap,
- TOTAL_SECS(sbi), 0);
- f2fs_bug_on(left_start >= TOTAL_SECS(sbi));
+ MAIN_SECS(sbi), 0);
+ f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
break;
}
secno = left_start;
@@ -726,7 +821,7 @@ skip_left:
}
got_it:
/* set it as dirty segment in free segmap */
- f2fs_bug_on(test_bit(segno, free_i->free_segmap));
+ f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
__set_inuse(sbi, segno);
*newseg = segno;
write_unlock(&free_i->segmap_lock);
@@ -808,7 +903,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
}
/*
- * This function always allocates a used segment (from dirty seglist) by SSR
+ * This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
+/*
+ * Trim (discard) the byte range described by @range.
+ *
+ * range->start and range->len are converted to block numbers with
+ * sbi->log_blocksize and clamped to the segments of the main area. The
+ * discards themselves are produced by a CP_DISCARD checkpoint driven
+ * through a struct cp_control. On success range->len is overwritten with
+ * the number of bytes accounted in cpc.trimmed.
+ *
+ * Returns -EINVAL when range->minlen exceeds a segment, when the start
+ * block is at or beyond MAX_BLKADDR(sbi), or when range->len is smaller
+ * than one block; returns 0 otherwise.
+ */
+int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
+{
+ __u64 start = range->start >> sbi->log_blocksize;
+ __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+ unsigned int start_segno, end_segno;
+ struct cp_control cpc;
+
+ if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
+ range->len < sbi->blocksize)
+ return -EINVAL;
+
+ /*
+ * Must be initialised before the early exit below: the out label
+ * reads cpc.trimmed to report the trimmed length, and a range lying
+ * entirely before the main area never reaches the checkpoint.
+ */
+ cpc.trimmed = 0;
+ if (end <= MAIN_BLKADDR(sbi))
+ goto out;
+
+ /* start/end segment number in main_area */
+ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
+ end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
+ GET_SEGNO(sbi, end);
+ cpc.reason = CP_DISCARD;
+ cpc.trim_start = start_segno;
+ cpc.trim_end = end_segno;
+ cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+
+ /* do checkpoint to issue discard commands safely */
+ write_checkpoint(sbi, &cpc);
+out:
+ range->len = cpc.trimmed << sbi->log_blocksize;
+ return 0;
+}
+
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
static int __get_segment_type(struct page *page, enum page_type p_type)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
- switch (sbi->active_logs) {
+ switch (F2FS_P_SB(page)->active_logs) {
case 2:
return __get_segment_type_2(page, p_type);
case 4:
return __get_segment_type_4(page, p_type);
}
/* NR_CURSEG_TYPE(6) logs by default */
- f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE);
+ f2fs_bug_on(F2FS_P_SB(page),
+ F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
return __get_segment_type_6(page, p_type);
}
@@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
void write_data_page(struct page *page, struct dnode_of_data *dn,
block_t *new_blkaddr, struct f2fs_io_info *fio)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
- f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
+ f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
@@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
void rewrite_data_page(struct page *page, block_t old_blkaddr,
struct f2fs_io_info *fio)
{
- struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
+ f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
}
void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1103,55 +1227,6 @@ void recover_data_page(struct f2fs_sb_info *sbi,
mutex_unlock(&curseg->curseg_mutex);
}
-void rewrite_node_page(struct f2fs_sb_info *sbi,
- struct page *page, struct f2fs_summary *sum,
- block_t old_blkaddr, block_t new_blkaddr)
-{
- struct sit_info *sit_i = SIT_I(sbi);
- int type = CURSEG_WARM_NODE;
- struct curseg_info *curseg;
- unsigned int segno, old_cursegno;
- block_t next_blkaddr = next_blkaddr_of_node(page);
- unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
- struct f2fs_io_info fio = {
- .type = NODE,
- .rw = WRITE_SYNC,
- };
-
- curseg = CURSEG_I(sbi, type);
-
- mutex_lock(&curseg->curseg_mutex);
- mutex_lock(&sit_i->sentry_lock);
-
- segno = GET_SEGNO(sbi, new_blkaddr);
- old_cursegno = curseg->segno;
-
- /* change the current segment */
- if (segno != curseg->segno) {
- curseg->next_segno = segno;
- change_curseg(sbi, type, true);
- }
- curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
- __add_sum_entry(sbi, type, sum);
-
- /* change the current log to the next block addr in advance */
- if (next_segno != segno) {
- curseg->next_segno = next_segno;
- change_curseg(sbi, type, true);
- }
- curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
-
- /* rewrite node page */
- set_page_writeback(page);
- f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
- locate_dirty_segment(sbi, old_cursegno);
-
- mutex_unlock(&sit_i->sentry_lock);
- mutex_unlock(&curseg->curseg_mutex);
-}
-
static inline bool is_merged_page(struct f2fs_sb_info *sbi,
struct page *page, enum page_type type)
{
@@ -1179,8 +1254,9 @@ out:
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
if (PageWriteback(page)) {
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+
if (is_merged_page(sbi, page, type))
f2fs_submit_merged_bio(sbi, type, WRITE);
wait_on_page_writeback(page);
@@ -1449,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
+ unsigned int offset = SIT_BLOCK_OFFSET(segno);
block_t blk_addr = sit_i->sit_base_addr + offset;
check_seg_range(sbi, segno);
@@ -1475,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
/* get current sit block page without lock */
src_page = get_meta_page(sbi, src_off);
dst_page = grab_meta_page(sbi, dst_off);
- f2fs_bug_on(PageDirty(src_page));
+ f2fs_bug_on(sbi, PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
@@ -1489,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
return dst_page;
}
-static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
+static struct sit_entry_set *grab_sit_entry_set(void)
+{
+ struct sit_entry_set *ses =
+ f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+
+ ses->entry_cnt = 0;
+ INIT_LIST_HEAD(&ses->set_list);
+ return ses;
+}
+
+static void release_sit_entry_set(struct sit_entry_set *ses)
+{
+ list_del(&ses->set_list);
+ kmem_cache_free(sit_entry_set_slab, ses);
+}
+
+static void adjust_sit_entry_set(struct sit_entry_set *ses,
+ struct list_head *head)
+{
+ struct sit_entry_set *next = ses;
+
+ if (list_is_last(&ses->set_list, head))
+ return;
+
+ list_for_each_entry_continue(next, head, set_list)
+ if (ses->entry_cnt <= next->entry_cnt)
+ break;
+
+ list_move_tail(&ses->set_list, &next->set_list);
+}
+
+static void add_sit_entry(unsigned int segno, struct list_head *head)
+{
+ struct sit_entry_set *ses;
+ unsigned int start_segno = START_SEGNO(segno);
+
+ list_for_each_entry(ses, head, set_list) {
+ if (ses->start_segno == start_segno) {
+ ses->entry_cnt++;
+ adjust_sit_entry_set(ses, head);
+ return;
+ }
+ }
+
+ ses = grab_sit_entry_set();
+
+ ses->start_segno = start_segno;
+ ses->entry_cnt++;
+ list_add(&ses->set_list, head);
+}
+
+static void add_sits_in_set(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ struct list_head *set_list = &sm_info->sit_entry_set;
+ unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
+ unsigned int segno;
+
+ for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
+ add_sit_entry(segno, set_list);
+}
+
+static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
int i;
- /*
- * If the journal area in the current summary is full of sit entries,
- * all the sit entries will be flushed. Otherwise the sit entries
- * are not able to replace with newly hot sit entries.
- */
- if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) {
- for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
- unsigned int segno;
- segno = le32_to_cpu(segno_in_journal(sum, i));
- __mark_sit_entry_dirty(sbi, segno);
- }
- update_sits_in_cursum(sum, -sits_in_cursum(sum));
- return true;
+ for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+ unsigned int segno;
+ bool dirtied;
+
+ segno = le32_to_cpu(segno_in_journal(sum, i));
+ dirtied = __mark_sit_entry_dirty(sbi, segno);
+
+ if (!dirtied)
+ add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
}
- return false;
+ update_sits_in_cursum(sum, -sits_in_cursum(sum));
}
/*
* CP calls this function, which flushes SIT entries including sit_journal,
* and moves prefree segs to free segs.
*/
-void flush_sit_entries(struct f2fs_sb_info *sbi)
+void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- unsigned long nsegs = TOTAL_SEGS(sbi);
- struct page *page = NULL;
- struct f2fs_sit_block *raw_sit = NULL;
- unsigned int start = 0, end = 0;
- unsigned int segno;
- bool flushed;
+ struct sit_entry_set *ses, *tmp;
+ struct list_head *head = &SM_I(sbi)->sit_entry_set;
+ bool to_journal = true;
+ struct seg_entry *se;
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
/*
- * "flushed" indicates whether sit entries in journal are flushed
- * to the SIT area or not.
+ * add and account sit entries of dirty bitmap in sit entry
+ * set temporarily
*/
- flushed = flush_sits_in_journal(sbi);
+ add_sits_in_set(sbi);
- for_each_set_bit(segno, bitmap, nsegs) {
- struct seg_entry *se = get_seg_entry(sbi, segno);
- int sit_offset, offset;
+ /*
+ * if there is not enough space in journal to store dirty sit
+ * entries, remove all entries from journal and add and account
+ * them in sit entry set.
+ */
+ if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+ remove_sits_in_journal(sbi);
- sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+ if (!sit_i->dirty_sentries)
+ goto out;
- /* add discard candidates */
- if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
- add_discard_addrs(sbi, segno, se);
+ /*
+ * there are two steps to flush sit entries:
+ * #1, flush sit entries to journal in current cold data summary block.
+ * #2, flush sit entries to sit page.
+ */
+ list_for_each_entry_safe(ses, tmp, head, set_list) {
+ struct page *page;
+ struct f2fs_sit_block *raw_sit = NULL;
+ unsigned int start_segno = ses->start_segno;
+ unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
+ (unsigned long)MAIN_SEGS(sbi));
+ unsigned int segno = start_segno;
+
+ if (to_journal &&
+ !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+ to_journal = false;
+
+ if (!to_journal) {
+ page = get_next_sit_page(sbi, start_segno);
+ raw_sit = page_address(page);
+ }
- if (flushed)
- goto to_sit_page;
+ /* flush dirty sit entries in region of current sit set */
+ for_each_set_bit_from(segno, bitmap, end) {
+ int offset, sit_offset;
- offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
- if (offset >= 0) {
- segno_in_journal(sum, offset) = cpu_to_le32(segno);
- seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
- goto flush_done;
- }
-to_sit_page:
- if (!page || (start > segno) || (segno > end)) {
- if (page) {
- f2fs_put_page(page, 1);
- page = NULL;
+ se = get_seg_entry(sbi, segno);
+
+ /* add discard candidates */
+ if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+ cpc->trim_start = segno;
+ add_discard_addrs(sbi, cpc);
}
- start = START_SEGNO(sit_i, segno);
- end = start + SIT_ENTRY_PER_BLOCK - 1;
+ if (to_journal) {
+ offset = lookup_journal_in_cursum(sum,
+ SIT_JOURNAL, segno, 1);
+ f2fs_bug_on(sbi, offset < 0);
+ segno_in_journal(sum, offset) =
+ cpu_to_le32(segno);
+ seg_info_to_raw_sit(se,
+ &sit_in_journal(sum, offset));
+ } else {
+ sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+ seg_info_to_raw_sit(se,
+ &raw_sit->entries[sit_offset]);
+ }
- /* read sit block that will be updated */
- page = get_next_sit_page(sbi, start);
- raw_sit = page_address(page);
+ __clear_bit(segno, bitmap);
+ sit_i->dirty_sentries--;
+ ses->entry_cnt--;
}
- /* udpate entry in SIT block */
- seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
-flush_done:
- __clear_bit(segno, bitmap);
- sit_i->dirty_sentries--;
+ if (!to_journal)
+ f2fs_put_page(page, 1);
+
+ f2fs_bug_on(sbi, ses->entry_cnt);
+ release_sit_entry_set(ses);
+ }
+
+ f2fs_bug_on(sbi, !list_empty(head));
+ f2fs_bug_on(sbi, sit_i->dirty_sentries);
+out:
+ if (cpc->reason == CP_DISCARD) {
+ for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
+ add_discard_addrs(sbi, cpc);
}
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- /* writeout last modified SIT block */
- f2fs_put_page(page, 1);
-
set_prefree_as_free_segments(sbi);
}
@@ -1603,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
+ sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
if (!sit_i->sentries)
return -ENOMEM;
- bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ for (start = 0; start < MAIN_SEGS(sbi); start++) {
sit_i->sentries[start].cur_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
@@ -1623,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
}
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) *
+ sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
sizeof(struct sec_entry));
if (!sit_i->sec_entries)
return -ENOMEM;
@@ -1658,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
static int build_free_segmap(struct f2fs_sb_info *sbi)
{
- struct f2fs_sm_info *sm_info = SM_I(sbi);
struct free_segmap_info *free_i;
unsigned int bitmap_size, sec_bitmap_size;
@@ -1669,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
- bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
- sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
+ sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -1684,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
memset(free_i->free_secmap, 0xff, sec_bitmap_size);
/* init free segmap information */
- free_i->start_segno =
- (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
+ free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
free_i->free_segments = 0;
free_i->free_sections = 0;
rwlock_init(&free_i->segmap_lock);
@@ -1722,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int nrpages = MAX_BIO_BLOCKS(sbi);
do {
readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1730,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
- for (; start < end && start < TOTAL_SEGS(sbi); start++) {
+ for (; start < end && start < MAIN_SEGS(sbi); start++) {
struct seg_entry *se = &sit_i->sentries[start];
struct f2fs_sit_block *sit_blk;
struct f2fs_sit_entry sit;
@@ -1768,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
unsigned int start;
int type;
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ for (start = 0; start < MAIN_SEGS(sbi); start++) {
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
@@ -1785,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
+ unsigned int segno = 0, offset = 0;
unsigned short valid_blocks;
while (1) {
/* find dirty segment based on free segmap */
- segno = find_next_inuse(free_i, total_segs, offset);
- if (segno >= total_segs)
+ segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
+ if (segno >= MAIN_SEGS(sbi))
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
- if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
+ if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
+ continue;
+ if (valid_blocks > sbi->blocks_per_seg) {
+ f2fs_bug_on(sbi, 1);
continue;
+ }
mutex_lock(&dirty_i->seglist_lock);
__locate_dirty_segment(sbi, segno, DIRTY);
mutex_unlock(&dirty_i->seglist_lock);
@@ -1806,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
static int init_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
+ unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
@@ -1827,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->dirty_info = dirty_i;
mutex_init(&dirty_i->seglist_lock);
- bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1851,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
sit_i->min_mtime = LLONG_MAX;
- for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
unsigned int i;
unsigned long long mtime = 0;
@@ -1889,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
sm_info->rec_prefree_segments = sm_info->main_segments *
DEF_RECLAIM_PREFREE_SEGMENTS / 100;
- sm_info->ipu_policy = F2FS_IPU_DISABLE;
+ sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
+ sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
INIT_LIST_HEAD(&sm_info->discard_list);
sm_info->nr_discards = 0;
sm_info->max_discards = 0;
+ INIT_LIST_HEAD(&sm_info->sit_entry_set);
+
if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
err = create_flush_cmd_control(sbi);
if (err)
@@ -1991,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
return;
if (sit_i->sentries) {
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ for (start = 0; start < MAIN_SEGS(sbi); start++) {
kfree(sit_i->sentries[start].cur_valid_map);
kfree(sit_i->sentries[start].ckpt_valid_map);
}
@@ -2025,11 +2197,30 @@ int __init create_segment_manager_caches(void)
discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
sizeof(struct discard_entry));
if (!discard_entry_slab)
- return -ENOMEM;
+ goto fail;
+
+ sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+ sizeof(struct sit_entry_set));
+ if (!sit_entry_set_slab)
+ goto destory_discard_entry;
+
+ inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+ sizeof(struct inmem_pages));
+ if (!inmem_entry_slab)
+ goto destroy_sit_entry_set;
return 0;
+
+destroy_sit_entry_set:
+ kmem_cache_destroy(sit_entry_set_slab);
+destory_discard_entry:
+ kmem_cache_destroy(discard_entry_slab);
+fail:
+ return -ENOMEM;
}
void destroy_segment_manager_caches(void)
{
+ kmem_cache_destroy(sit_entry_set_slab);
kmem_cache_destroy(discard_entry_slab);
+ kmem_cache_destroy(inmem_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 55973f7b0330..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
(secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
sbi->segs_per_sec)) \
-#define START_BLOCK(sbi, segno) \
- (SM_I(sbi)->seg0_blkaddr + \
+#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
+#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
+
+#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
+#define MAIN_SECS(sbi) (sbi->total_sections)
+
+#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
+#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+
+#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
+#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
+ sbi->log_blocks_per_seg))
+
+#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
(GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+
#define NEXT_FREE_BLKADDR(sbi, curseg) \
(START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
-#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr)
-
-#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
- ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
@@ -77,23 +87,21 @@
#define SIT_ENTRY_OFFSET(sit_i, segno) \
(segno % sit_i->sents_per_block)
-#define SIT_BLOCK_OFFSET(sit_i, segno) \
+#define SIT_BLOCK_OFFSET(segno) \
(segno / SIT_ENTRY_PER_BLOCK)
-#define START_SEGNO(sit_i, segno) \
- (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+#define START_SEGNO(segno) \
+ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
#define SIT_BLK_CNT(sbi) \
- ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
+ ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
#define f2fs_bitmap_size(nr) \
(BITS_TO_LONGS(nr) * sizeof(unsigned long))
-#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define TOTAL_SECS(sbi) (sbi->total_sections)
-#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
- (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
-#define SECTOR_TO_BLOCK(sbi, sectors) \
- (sectors >> (sbi)->log_sectors_per_block)
-#define MAX_BIO_BLOCKS(max_hw_blocks) \
- (min((int)max_hw_blocks, BIO_MAX_PAGES))
+#define SECTOR_FROM_BLOCK(blk_addr) \
+ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
+#define SECTOR_TO_BLOCK(sectors) \
+ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+#define MAX_BIO_BLOCKS(sbi) \
+ ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -167,6 +175,11 @@ struct segment_allocation {
void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
};
+struct inmem_pages {
+ struct list_head list;
+ struct page *page;
+};
+
struct sit_info {
const struct segment_allocation *s_ops;
@@ -237,6 +250,12 @@ struct curseg_info {
unsigned int next_segno; /* preallocated segment */
};
+struct sit_entry_set {
+ struct list_head set_list; /* link with all sit sets */
+ unsigned int start_segno; /* start segno of sits in set */
+ unsigned int entry_cnt; /* the # of sit entries in set */
+};
+
/*
* inline functions
*/
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
clear_bit(segno, free_i->free_segmap);
free_i->free_segments++;
- next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+ next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
if (next >= start_segno + sbi->segs_per_sec) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
- return (prefree_segments(sbi) / sbi->segs_per_sec)
- + free_sections(sbi) < overprovision_sections(sbi);
+ int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+ int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+ reserved_sections(sbi) + 1);
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* F2FS_IPU_UTIL - if FS utilization is over threashold,
* F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
* threashold,
+ * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
+ * storage. IPU will be triggered only if the # of dirty
+ * pages is over min_fsync_blocks.
* F2FS_IPUT_DISABLE - disable IPU. (=default option)
*/
#define DEF_MIN_IPU_UTIL 70
+#define DEF_MIN_FSYNC_BLOCKS 8
enum {
F2FS_IPU_FORCE,
F2FS_IPU_SSR,
F2FS_IPU_UTIL,
F2FS_IPU_SSR_UTIL,
- F2FS_IPU_DISABLE,
+ F2FS_IPU_FSYNC,
};
static inline bool need_inplace_update(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ unsigned int policy = SM_I(sbi)->ipu_policy;
/* IPU can be done only for the user data */
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
return false;
- /* this is only set during fdatasync */
- if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+ if (policy & (0x1 << F2FS_IPU_FORCE))
+ return true;
+ if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
+ return true;
+ if (policy & (0x1 << F2FS_IPU_UTIL) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ return true;
+ if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
- switch (SM_I(sbi)->ipu_policy) {
- case F2FS_IPU_FORCE:
+ /* this is only set during fdatasync */
+ if (policy & (0x1 << F2FS_IPU_FSYNC) &&
+ is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
return true;
- case F2FS_IPU_SSR:
- if (need_SSR(sbi))
- return true;
- break;
- case F2FS_IPU_UTIL:
- if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
- return true;
- break;
- case F2FS_IPU_SSR_UTIL:
- if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
- return true;
- break;
- case F2FS_IPU_DISABLE:
- break;
- }
+
return false;
}
@@ -534,28 +554,21 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
#ifdef CONFIG_F2FS_CHECK_FS
static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
{
- unsigned int end_segno = SM_I(sbi)->segment_count - 1;
- BUG_ON(segno > end_segno);
+ BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
}
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
- struct f2fs_sm_info *sm_info = SM_I(sbi);
- block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
- block_t start_addr = sm_info->seg0_blkaddr;
- block_t end_addr = start_addr + total_blks - 1;
- BUG_ON(blk_addr < start_addr);
- BUG_ON(blk_addr > end_addr);
+ BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
+ BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
}
/*
- * Summary block is always treated as invalid block
+ * Summary block is always treated as an invalid block
*/
static inline void check_block_count(struct f2fs_sb_info *sbi,
int segno, struct f2fs_sit_entry *raw_sit)
{
- struct f2fs_sm_info *sm_info = SM_I(sbi);
- unsigned int end_segno = sm_info->segment_count - 1;
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
int cur_pos = 0, next_pos;
@@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
/* check boundary of a given segment number */
- BUG_ON(segno > end_segno);
+ BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
/* check bitmap with valid block count */
do {
@@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
}
#else
-#define check_seg_range(sbi, segno)
-#define verify_block_addr(sbi, blk_addr)
-#define check_block_count(sbi, segno, raw_sit)
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ if (segno > TOTAL_SEGS(sbi) - 1)
+ sbi->need_fsck = true;
+}
+
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+ if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
+ sbi->need_fsck = true;
+}
+
+/*
+ * Summary block is always treated as an invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+ int segno, struct f2fs_sit_entry *raw_sit)
+{
+ /* check segment usage */
+ if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
+ sbi->need_fsck = true;
+
+ /* check boundary of a given segment number */
+ if (segno > TOTAL_SEGS(sbi) - 1)
+ sbi->need_fsck = true;
+}
#endif
static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
unsigned int start)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+ unsigned int offset = SIT_BLOCK_OFFSET(start);
block_t blk_addr = sit_i->sit_base_addr + offset;
check_seg_range(sbi, start);
@@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
{
- unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
+ unsigned int block_off = SIT_BLOCK_OFFSET(start);
if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
{
struct block_device *bdev = sbi->sb->s_bdev;
struct request_queue *q = bdev_get_queue(bdev);
- return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
+ return SECTOR_TO_BLOCK(queue_max_sectors(q));
}
/*
@@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
else if (type == NODE)
return 3 * sbi->blocks_per_seg;
else if (type == META)
- return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ return MAX_BIO_BLOCKS(sbi);
else
return 0;
}
@@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
else if (type == NODE)
desired = 3 * max_hw_blocks(sbi);
else
- desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ desired = MAX_BIO_BLOCKS(sbi);
wbc->nr_to_write = desired;
return desired - nr_to_write;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 657582fc7601..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_small_discards),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
+ ATTR_LIST(min_fsync_blocks),
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
@@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
- atomic_set(&fi->dirty_dents, 0);
+ atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
rwlock_init(&fi->ext.ext_lock);
init_rwsem(&fi->i_sem);
+ INIT_LIST_HEAD(&fi->inmem_pages);
+ mutex_init(&fi->inmem_lock);
set_inode_flag(fi, FI_NEW_INODE);
@@ -432,8 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
stop_gc_thread(sbi);
/* We don't need to do checkpoint when it's clean */
- if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES))
- write_checkpoint(sbi, true);
+ if (sbi->s_dirty) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT,
+ };
+ write_checkpoint(sbi, &cpc);
+ }
+
+ /*
+ * normally superblock is clean, so we need to release this.
+ * In addition, EIO will skip the checkpoint, so we need this as well.
+ */
+ release_dirty_inode(sbi);
+ release_discard_addrs(sbi);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -457,12 +472,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
trace_f2fs_sync_fs(sb, sync);
- if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
- return 0;
-
if (sync) {
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
} else {
f2fs_balance_fs(sbi);
@@ -505,8 +520,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
- buf->f_files = sbi->total_node_count;
- buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
+ buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+ buf->f_ffree = buf->f_files - valid_inode_count(sbi);
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
@@ -613,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
+ sbi->mount_opt.opt = 0;
+ sbi->active_logs = NR_CURSEG_TYPE;
+
/* parse mount options */
err = parse_options(sb, data);
if (err)
@@ -663,7 +681,7 @@ restore_gc:
if (need_restart_gc) {
if (start_gc_thread(sbi))
f2fs_msg(sbi->sb, KERN_WARNING,
- "background gc thread is stop");
+ "background gc thread has stopped");
} else if (need_stop_gc) {
stop_gc_thread(sbi);
}
@@ -783,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
return 1;
}
- if (le32_to_cpu(raw_super->log_sectorsize) !=
- F2FS_LOG_SECTOR_SIZE) {
- f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
+ /* Currently, support 512/1024/2048/4096 bytes sector size */
+ if (le32_to_cpu(raw_super->log_sectorsize) >
+ F2FS_MAX_LOG_SECTOR_SIZE ||
+ le32_to_cpu(raw_super->log_sectorsize) <
+ F2FS_MIN_LOG_SECTOR_SIZE) {
+ f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
+ le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
- if (le32_to_cpu(raw_super->log_sectors_per_block) !=
- F2FS_LOG_SECTORS_PER_BLOCK) {
- f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block");
+ if (le32_to_cpu(raw_super->log_sectors_per_block) +
+ le32_to_cpu(raw_super->log_sectorsize) !=
+ F2FS_MAX_LOG_SECTOR_SIZE) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid log sectors per block(%u) log sectorsize(%u)",
+ le32_to_cpu(raw_super->log_sectors_per_block),
+ le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
return 0;
@@ -812,7 +838,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (unlikely(fsmeta >= total))
return 1;
- if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
+ if (unlikely(f2fs_cp_error(sbi))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
}
@@ -846,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
+ sbi->need_fsck = false;
}
/*
@@ -899,8 +926,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct buffer_head *raw_super_buf;
struct inode *root;
long err = -EINVAL;
+ bool retry = true;
int i;
+try_onemore:
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -1077,12 +1106,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto free_proc;
+ if (!retry)
+ sbi->need_fsck = true;
+
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
err = recover_fsync_data(sbi);
- if (err)
+ if (err) {
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%ld", err);
+ goto free_kobj;
+ }
}
/*
@@ -1123,6 +1157,13 @@ free_sb_buf:
brelse(raw_super_buf);
free_sbi:
kfree(sbi);
+
+ /* give only one another chance */
+ if (retry) {
+ retry = 0;
+ shrink_dcache_sb(sb);
+ goto try_onemore;
+ }
return err;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941ee309..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
static void *read_all_xattrs(struct inode *inode, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_header *header;
size_t size = PAGE_SIZE, inline_size = 0;
void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
void *txattr_addr, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
size_t inline_size = 0;
void *xattr_addr;
struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
alloc_nid_failed(sbi, new_nid);
return PTR_ERR(xpage);
}
- f2fs_bug_on(new_nid);
+ f2fs_bug_on(sbi, new_nid);
f2fs_wait_on_page_writeback(xpage, NODE);
} else {
struct dnode_of_data dn;
@@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
int free;
/*
* If value is NULL, it is remove operation.
- * In case of update operation, we caculate free.
+ * In case of update operation, we calculate free.
*/
free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
if (found)
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
const void *value, size_t size,
struct page *ipage, int flags)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
/* this case is only from init_inode_metadata */
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
submit_op_failed:
clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
spin_unlock(&cookie->lock);
+ fscache_unuse_cookie(object);
kfree(op);
_leave(" [EIO]");
return transit_to(KILL_OBJECT);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 85332b9d19d1..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
EXPORT_SYMBOL(__fscache_wait_on_page_write);
/*
+ * wait for a page to finish being written to the cache. Put a timeout here
+ * since we might be called recursively via parent fs.
+ */
+static
+bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
+{
+	wait_queue_head_t *waitq = bit_waitqueue(&cookie->flags, 0);
+	long remaining;
+
+	/*
+	 * Sleep on the cookie's bit waitqueue until the page write check
+	 * reports false, giving up after HZ jiffies.
+	 */
+	remaining = wait_event_timeout(*waitq,
+				       !__fscache_check_page_write(cookie, page),
+				       HZ);
+
+	/* Converted to bool: false only when the wait result was zero. */
+	return remaining;
+}
+
+/*
* decide whether a page can be released, possibly by cancelling a store to it
* - we're allowed to sleep if __GFP_WAIT is flagged
*/
@@ -115,7 +128,10 @@ page_busy:
}
fscache_stat(&fscache_n_store_vmscan_wait);
- __fscache_wait_on_page_write(cookie, page);
+ if (!release_page_wait_timeout(cookie, page))
+ _debug("fscache writeout timeout page: %p{%lx}",
+ page, page->index);
+
gfp &= ~__GFP_WAIT;
goto try_again;
}
@@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
{
struct fscache_operation *op;
struct fscache_object *object;
- bool wake_cookie;
+ bool wake_cookie = false;
_enter("%p", cookie);
@@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
__fscache_use_cookie(cookie);
if (fscache_submit_exclusive_op(object, op) < 0)
- goto nobufs;
+ goto nobufs_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_attr_changed_ok);
fscache_put_operation(op);
_leave(" = 0");
return 0;
-nobufs:
+nobufs_dec:
wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs:
spin_unlock(&cookie->lock);
kfree(op);
if (wake_cookie)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 912061ac4baf..caa8d95b24e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1305,6 +1305,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t start;
ssize_t ret = iov_iter_get_pages(ii,
&req->pages[req->num_pages],
+ *nbytesp - nbytes,
req->max_pages - req->num_pages,
&start);
if (ret < 0)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6e8d99..f0b945ab853e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
* Returns: The length of the extent (minimum of one block)
*/
-static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
{
const __be64 *end = (start + len);
const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
struct buffer_head *bh_map, struct metapath *mp,
const unsigned int sheight,
const unsigned int height,
- const unsigned int maxlen)
+ const size_t maxlen)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
} else {
/* Need to allocate indirect blocks */
ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
- dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+ dblks = min(maxlen, (size_t)(ptrs_per_blk -
+ mp->mp_list[end_of_metadata]));
if (height == ip->i_height) {
/* Writing into existing tree, extend tree down */
iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned int bsize = sdp->sd_sb.sb_bsize;
- const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+ const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
const u64 *arr = sdp->sd_heightsize;
__be64 *ptr;
u64 size;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 26b3f952e6b1..7f4ed3daa38c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
#include <linux/aio.h>
+#include <linux/delay.h>
#include "gfs2.h"
#include "incore.h"
@@ -979,9 +980,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
unsigned int state;
int flags;
int error = 0;
+ int sleeptime;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
mutex_lock(&fp->f_fl_mutex);
@@ -1001,7 +1003,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
gfs2_holder_init(gl, state, flags, fl_gh);
gfs2_glock_put(gl);
}
- error = gfs2_glock_nq(fl_gh);
+ for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
+ error = gfs2_glock_nq(fl_gh);
+ if (error != GLR_TRYFAILED)
+ break;
+ fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
+ fl_gh->gh_error = 0;
+ msleep(sleeptime);
+ }
if (error) {
gfs2_holder_uninit(fl_gh);
if (error == GLR_TRYFAILED)
@@ -1024,7 +1033,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
mutex_lock(&fp->f_fl_mutex);
flock_lock_file_wait(file, fl);
if (fl_gh->gh_gl) {
- gfs2_glock_dq_wait(fl_gh);
+ gfs2_glock_dq(fl_gh);
gfs2_holder_uninit(fl_gh);
}
mutex_unlock(&fp->f_fl_mutex);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c9ada3..39e7e9959b74 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
unsigned long gh_ip;
};
+/* Number of quota types we support */
+#define GFS2_MAXQUOTAS 2
+
/* Resource group multi-block reservation, in order of appearance:
Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
u64 rs_inum; /* Inode number for reservation */
/* ancillary quota stuff */
- struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
- struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+ struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
+ struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
unsigned int rs_qa_qd_num;
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e59477884..fc8ac2ee0667 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!IS_ERR(inode)) {
d = d_splice_alias(inode, dentry);
error = PTR_ERR(d);
- if (IS_ERR(d))
+ if (IS_ERR(d)) {
+ inode = ERR_CAST(d);
goto fail_gunlock;
+ }
error = 0;
if (file) {
if (S_ISREG(inode->i_mode)) {
@@ -840,8 +842,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
int error;
inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (!inode)
+ if (inode == NULL) {
+ d_add(dentry, NULL);
return NULL;
+ }
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -854,7 +858,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
d = d_splice_alias(inode, dentry);
if (IS_ERR(d)) {
- iput(inode);
gfs2_glock_dq_uninit(&gh);
return d;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2607ff13d486..a346f56c4c6d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
int val;
if (is_ancestor(root, sdp->sd_master_dir))
- seq_printf(s, ",meta");
+ seq_puts(s, ",meta");
if (args->ar_lockproto[0])
seq_printf(s, ",lockproto=%s", args->ar_lockproto);
if (args->ar_locktable[0])
@@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
if (args->ar_hostdata[0])
seq_printf(s, ",hostdata=%s", args->ar_hostdata);
if (args->ar_spectator)
- seq_printf(s, ",spectator");
+ seq_puts(s, ",spectator");
if (args->ar_localflocks)
- seq_printf(s, ",localflocks");
+ seq_puts(s, ",localflocks");
if (args->ar_debug)
- seq_printf(s, ",debug");
+ seq_puts(s, ",debug");
if (args->ar_posix_acl)
- seq_printf(s, ",acl");
+ seq_puts(s, ",acl");
if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
char *state;
switch (args->ar_quota) {
@@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",quota=%s", state);
}
if (args->ar_suiddir)
- seq_printf(s, ",suiddir");
+ seq_puts(s, ",suiddir");
if (args->ar_data != GFS2_DATA_DEFAULT) {
char *state;
switch (args->ar_data) {
@@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",data=%s", state);
}
if (args->ar_discard)
- seq_printf(s, ",discard");
+ seq_puts(s, ",discard");
val = sdp->sd_tune.gt_logd_secs;
if (val != 30)
seq_printf(s, ",commit=%d", val);
@@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",errors=%s", state);
}
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
- seq_printf(s, ",nobarrier");
+ seq_puts(s, ",nobarrier");
if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
- seq_printf(s, ",demote_interface_used");
+ seq_puts(s, ",demote_interface_used");
if (args->ar_rgrplvb)
- seq_printf(s, ",rgrplvb");
+ seq_puts(s, ",rgrplvb");
return 0;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6fac74349856..b73e0215baa7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
struct commit_header *h;
__u32 csum;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
h = (struct commit_header *)(bh->b_data);
@@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
return checksum;
}
-static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
- if (tag_bytes > JBD2_TAG_SIZE32)
+ if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
@@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j,
struct jbd2_journal_block_tail *tail;
__u32 csum;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
@@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j,
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
+ journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
seq = cpu_to_be32(sequence);
@@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
- /* We only have space to store the lower 16 bits of the crc32c. */
- tag->t_checksum = cpu_to_be16(csum32);
+ if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ tag3->t_checksum = cpu_to_be32(csum32);
+ else
+ tag->t_checksum = cpu_to_be16(csum32);
}
/*
* jbd2_journal_commit_transaction
@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
LIST_HEAD(io_bufs);
LIST_HEAD(log_bufs);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_block_tail);
/*
@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag_flag |= JBD2_FLAG_SAME_UUID;
tag = (journal_block_tag_t *) tagp;
- write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
+ write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e303946c..19d74d86d99c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
{
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
return sb->s_checksum == jbd2_superblock_csum(j, sb);
@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
{
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
sb->s_checksum = jbd2_superblock_csum(j, sb);
@@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
- JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ if (jbd2_journal_has_csum_v2or3(journal) &&
+ JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
"at the same time!\n");
goto out;
}
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
+ JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ /* Can't have checksum v2 and v3 at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+ "at the same time!\n");
+ goto out;
+ }
+
if (!jbd2_verify_csum_type(journal, sb)) {
printk(KERN_ERR "JBD2: Unknown checksum type\n");
goto out;
}
/* Load the checksum driver */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ if (jbd2_journal_has_csum_v2or3(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Precompute checksum seed for all metadata */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
return 0;
- /* Asking for checksumming v2 and v1? Only give them v2. */
- if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+ /* If enabling v2 checksums, turn on v3 instead */
+ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
+ incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
+ }
+
+ /* Asking for checksumming v3 and v1? Only give them v3. */
+ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
compat & JBD2_FEATURE_COMPAT_CHECKSUM)
compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
- /* If enabling v2 checksums, update superblock */
- if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ /* If enabling v3 checksums, update superblock */
+ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
}
/* Precompute checksum seed for all metadata */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0,
sb->s_uuid,
sizeof(sb->s_uuid));
@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
/* If enabling v1 checksums, downgrade superblock */
if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
sb->s_feature_incompat &=
- ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
+ JBD2_FEATURE_INCOMPAT_CSUM_V3);
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
*/
size_t journal_tag_bytes(journal_t *journal)
{
- journal_block_tag_t tag;
- size_t x = 0;
+ size_t sz;
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ return sizeof(journal_block_tag3_t);
+
+ sz = sizeof(journal_block_tag_t);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
- x += sizeof(tag.t_checksum);
+ sz += sizeof(__u16);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
- return x + JBD2_TAG_SIZE64;
+ return sz;
else
- return x + JBD2_TAG_SIZE32;
+ return sz - sizeof(__u32);
}
/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3b6bb19d60b1..9b329b55ffe3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
__be32 provided;
__u32 calculated;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
size -= sizeof(struct jbd2_journal_block_tail);
tagp = &bh->b_data[sizeof(journal_header_t)];
@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
return err;
}
-static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
+static inline unsigned long long read_tag_block(journal_t *journal,
+ journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
- if (tag_bytes > JBD2_TAG_SIZE32)
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
__be32 provided;
__u32 calculated;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
h = buf;
@@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
+ journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
__u32 csum32;
__be32 seq;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
seq = cpu_to_be32(sequence);
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- return tag->t_checksum == cpu_to_be16(csum32);
+ if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ return tag3->t_checksum == cpu_to_be32(csum32);
+ else
+ return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
int descr_csum_size = 0;
+ int block_error = 0;
/*
* First thing is to establish what we expect to find in the log
@@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
descr_csum_size =
sizeof(struct jbd2_journal_block_tail);
if (descr_csum_size > 0 &&
@@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal,
unsigned long long blocknr;
J_ASSERT(obh != NULL);
- blocknr = read_tag_block(tag_bytes,
+ blocknr = read_tag_block(journal,
tag);
/* If the block has been
@@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal,
"checksum recovering "
"block %llu in log\n",
blocknr);
- continue;
+ block_error = 1;
+ goto skip_write;
}
/* Find a buffer for the new
@@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal,
success = -EIO;
}
}
-
+ if (block_error && success == 0)
+ success = -EIO;
return success;
failed:
@@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
__be32 provided;
__u32 calculated;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 198c9c10276d..d5e95a175c92 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -91,8 +91,8 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
-#endif
#include <linux/log2.h>
+#endif
static struct kmem_cache *jbd2_revoke_record_cache;
static struct kmem_cache *jbd2_revoke_table_cache;
@@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal,
offset = *offsetp;
/* Do we need to leave space at the end for a checksum? */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
/* Make sure we have a descriptor with space left for the record */
@@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
- svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
+ svcshare.o svcproc.o svcsubs.o mon.o xdr.o
lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_PROC_FS) += procfs.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+ if (status == -ECONNREFUSED) {
+ dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
+ status);
+ rpc_force_rebind(clnt);
+ status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+ }
if (status < 0)
dprintk("lockd: NSM upcall RPC failed, status=%d\n",
status);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- struct list_head grace_list;
spinlock_t nsm_clnt_lock;
unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+/*
+ * Write handler for the nlm_end_grace procfs file.
+ *
+ * Only the first byte written is examined: 'Y', 'y' or '1' calls
+ * locks_end_grace() on the lockd_manager belonging to the writing task's
+ * network namespace.  Anything else is rejected.
+ *
+ * Returns @size on success, -EINVAL for an empty write or an unrecognised
+ * first byte, or the error reported by simple_transaction_get().
+ */
+static ssize_t
+nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
+		    loff_t *pos)
+{
+	struct lockd_net *ln;
+	char *kbuf;
+	char first;
+
+	ln = net_generic(current->nsproxy->net_ns, lockd_net_id);
+
+	if (size == 0)
+		return -EINVAL;
+
+	kbuf = simple_transaction_get(file, buf, size);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	first = kbuf[0];
+	if (first != 'Y' && first != 'y' && first != '1')
+		return -EINVAL;
+
+	locks_end_grace(&ln->lockd_manager);
+	return size;
+}
+
+/*
+ * Read handler for the nlm_end_grace procfs file.
+ *
+ * Reports "Y\n" when the list head of the lockd_manager for the reading
+ * task's network namespace is empty, and "N\n" otherwise (presumably
+ * "empty" means no grace period is in progress -- confirm against
+ * locks_start_grace()/locks_end_grace()).
+ *
+ * Returns the number of bytes copied to @buf, or a negative errno from
+ * simple_read_from_buffer().
+ */
+static ssize_t
+nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
+		   loff_t *pos)
+{
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+	char resp[2];
+
+	resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
+	resp[1] = '\n';
+
+	/*
+	 * Hand out exactly the two text bytes.  The buffer is deliberately
+	 * not NUL-terminated: simple_read_from_buffer() copies by length, and
+	 * a terminator counted in that length would reach userspace as a
+	 * stray '\0' at the end of the file.
+	 */
+	return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
+}
+
+/*
+ * File operations for the nlm_end_grace procfs entry.  Writes go through
+ * the simple_transaction helpers, hence simple_transaction_release() as
+ * the release hook.
+ */
+static const struct file_operations lockd_end_grace_operations = {
+	.owner		= THIS_MODULE,
+	.llseek		= default_llseek,
+	.read		= nlm_end_grace_read,
+	.write		= nlm_end_grace_write,
+	.release	= simple_transaction_release,
+};
+
+/*
+ * Create the "fs/lockd" procfs directory and the "nlm_end_grace" file
+ * inside it.
+ *
+ * Returns 0 on success or -ENOMEM if either entry could not be created;
+ * when the file cannot be created the directory is removed again.
+ */
+int __init
+lockd_create_procfs(void)
+{
+	struct proc_dir_entry *dir;
+	struct proc_dir_entry *file;
+
+	dir = proc_mkdir("fs/lockd", NULL);
+	if (dir == NULL)
+		return -ENOMEM;
+
+	file = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, dir,
+			   &lockd_end_grace_operations);
+	if (file != NULL)
+		return 0;
+
+	/* File creation failed: undo the directory creation. */
+	remove_proc_entry("fs/lockd", NULL);
+	return -ENOMEM;
+}
+
+/*
+ * Remove the lockd procfs entries: the "fs/lockd/nlm_end_grace" file is
+ * removed first, then the "fs/lockd" directory that held it.
+ */
+void __exit
+lockd_remove_procfs(void)
+{
+ remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
+ remove_proc_entry("fs/lockd", NULL);
+}
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+#ifndef _LOCKD_PROCFS_H
+#define _LOCKD_PROCFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+int lockd_create_procfs(void);
+void lockd_remove_procfs(void);
+#else
+/* Without CONFIG_PROC_FS there is nothing to create: report success. */
+static inline int lockd_create_procfs(void)
+{
+	return 0;
+}
+
+/* Without CONFIG_PROC_FS there is nothing to remove. */
+static inline void lockd_remove_procfs(void)
+{
+}
+#endif /* IS_ENABLED(CONFIG_PROC_FS) */
+
+#endif /* _LOCKD_PROCFS_H */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93f8d2e..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
#include <linux/nfs.h>
#include "netns.h"
+#include "procfs.h"
#define NLMDBG_FACILITY NLMDBG_SVC
#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -253,13 +254,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
error = make_socks(serv, net);
if (error < 0)
- goto err_socks;
+ goto err_bind;
set_grace_period(net);
dprintk("lockd_up_net: per-net data created; net=%p\n", net);
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
err_bind:
ln->nlmsvc_users--;
return error;
@@ -306,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv)
svc_sock_update_bufs(serv);
serv->sv_maxconn = nlm_max_connections;
- nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
+ nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
error = PTR_ERR(nlmsvc_task);
printk(KERN_WARNING
"lockd_up: kthread_run failed, error=%d\n", error);
goto out_task;
}
+ nlmsvc_rqst->rq_task = nlmsvc_task;
+ wake_up_process(nlmsvc_task);
+
dprintk("lockd_up: service started\n");
return 0;
@@ -583,7 +585,7 @@ static int lockd_init_net(struct net *net)
struct lockd_net *ln = net_generic(net, lockd_net_id);
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
- INIT_LIST_HEAD(&ln->grace_list);
+ INIT_LIST_HEAD(&ln->lockd_manager.list);
spin_lock_init(&ln->nsm_clnt_lock);
return 0;
}
@@ -617,8 +619,15 @@ static int __init init_nlm(void)
err = register_pernet_subsys(&lockd_net_ops);
if (err)
goto err_pernet;
+
+ err = lockd_create_procfs();
+ if (err)
+ goto err_procfs;
+
return 0;
+err_procfs:
+ unregister_pernet_subsys(&lockd_net_ops);
err_pernet:
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
@@ -631,6 +640,7 @@ static void __exit exit_nlm(void)
{
/* FIXME: delete all NLM clients */
nlm_shutdown_hosts();
+ lockd_remove_procfs();
unregister_pernet_subsys(&lockd_net_ops);
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
diff --git a/fs/locks.c b/fs/locks.c
index cb66fb05ad4a..bb08857f90b5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1619,7 +1619,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
smp_mb();
error = check_conflicting_open(dentry, arg);
if (error)
- locks_unlink_lock(flp);
+ locks_unlink_lock(before);
out:
if (is_deleg)
mutex_unlock(&inode->i_mutex);
diff --git a/fs/namei.c b/fs/namei.c
index a996bb48dfab..a7b05bf82d31 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
+#include <linux/hash.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -643,24 +644,22 @@ static int complete_walk(struct nameidata *nd)
static __always_inline void set_root(struct nameidata *nd)
{
- if (!nd->root.mnt)
- get_fs_root(current->fs, &nd->root);
+ get_fs_root(current->fs, &nd->root);
}
static int link_path_walk(const char *, struct nameidata *);
-static __always_inline void set_root_rcu(struct nameidata *nd)
+static __always_inline unsigned set_root_rcu(struct nameidata *nd)
{
- if (!nd->root.mnt) {
- struct fs_struct *fs = current->fs;
- unsigned seq;
+ struct fs_struct *fs = current->fs;
+ unsigned seq, res;
- do {
- seq = read_seqcount_begin(&fs->seq);
- nd->root = fs->root;
- nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
- }
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ nd->root = fs->root;
+ res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ } while (read_seqcount_retry(&fs->seq, seq));
+ return res;
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -860,7 +859,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
return PTR_ERR(s);
}
if (*s == '/') {
- set_root(nd);
+ if (!nd->root.mnt)
+ set_root(nd);
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->root);
@@ -1137,13 +1137,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
*/
*inode = path->dentry->d_inode;
}
- return read_seqretry(&mount_lock, nd->m_seq) &&
+ return !read_seqretry(&mount_lock, nd->m_seq) &&
!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
}
static int follow_dotdot_rcu(struct nameidata *nd)
{
- set_root_rcu(nd);
+ struct inode *inode = nd->inode;
+ if (!nd->root.mnt)
+ set_root_rcu(nd);
while (1) {
if (nd->path.dentry == nd->root.dentry &&
@@ -1155,6 +1157,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
struct dentry *parent = old->d_parent;
unsigned seq;
+ inode = parent->d_inode;
seq = read_seqcount_begin(&parent->d_seq);
if (read_seqcount_retry(&old->d_seq, nd->seq))
goto failed;
@@ -1164,6 +1167,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
}
if (!follow_up_rcu(&nd->path))
break;
+ inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
}
while (d_mountpoint(nd->path.dentry)) {
@@ -1173,11 +1177,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
break;
nd->path.mnt = &mounted->mnt;
nd->path.dentry = mounted->mnt.mnt_root;
+ inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- if (!read_seqretry(&mount_lock, nd->m_seq))
+ if (read_seqretry(&mount_lock, nd->m_seq))
goto failed;
}
- nd->inode = nd->path.dentry->d_inode;
+ nd->inode = inode;
return 0;
failed:
@@ -1256,7 +1261,8 @@ static void follow_mount(struct path *path)
static void follow_dotdot(struct nameidata *nd)
{
- set_root(nd);
+ if (!nd->root.mnt)
+ set_root(nd);
while(1) {
struct dentry *old = nd->path.dentry;
@@ -1634,8 +1640,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
static inline unsigned int fold_hash(unsigned long hash)
{
- hash += hash >> (8*sizeof(int));
- return hash;
+ return hash_64(hash, 32);
}
#else /* 32-bit case */
@@ -1669,9 +1674,9 @@ EXPORT_SYMBOL(full_name_hash);
/*
* Calculate the length and hash of the path component, and
- * return the length of the component;
+ * return the "hash_len" as the result.
*/
-static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+static inline u64 hash_name(const char *name)
{
unsigned long a, b, adata, bdata, mask, hash, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
@@ -1691,9 +1696,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
mask = create_zero_mask(adata | bdata);
hash += a & zero_bytemask(mask);
- *hashp = fold_hash(hash);
-
- return len + find_zero(mask);
+ len += find_zero(mask);
+ return hashlen_create(fold_hash(hash), len);
}
#else
@@ -1711,7 +1715,7 @@ EXPORT_SYMBOL(full_name_hash);
* We know there's a real path component here of at least
* one character.
*/
-static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+static inline u64 hash_name(const char *name)
{
unsigned long hash = init_name_hash();
unsigned long len = 0, c;
@@ -1722,8 +1726,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
hash = partial_name_hash(c, hash);
c = (unsigned char)name[len];
} while (c && c != '/');
- *hashp = end_name_hash(hash);
- return len;
+ return hashlen_create(end_name_hash(hash), len);
}
#endif
@@ -1748,20 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
/* At this point we know we have a real path component. */
for(;;) {
- struct qstr this;
- long len;
+ u64 hash_len;
int type;
err = may_lookup(nd);
if (err)
break;
- len = hash_name(name, &this.hash);
- this.name = name;
- this.len = len;
+ hash_len = hash_name(name);
type = LAST_NORM;
- if (name[0] == '.') switch (len) {
+ if (name[0] == '.') switch (hashlen_len(hash_len)) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
@@ -1775,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd)
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+ struct qstr this = { { .hash_len = hash_len }, .name = name };
err = parent->d_op->d_hash(parent, &this);
if (err < 0)
break;
+ hash_len = this.hash_len;
+ name = this.name;
}
}
- nd->last = this;
+ nd->last.hash_len = hash_len;
+ nd->last.name = name;
nd->last_type = type;
- if (!name[len])
+ name += hashlen_len(hash_len);
+ if (!*name)
return 0;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
- len++;
- } while (unlikely(name[len] == '/'));
- if (!name[len])
+ name++;
+ } while (unlikely(*name == '/'));
+ if (!*name)
return 0;
- name += len;
-
err = walk_component(nd, &next, LOOKUP_FOLLOW);
if (err < 0)
return err;
@@ -1852,7 +1855,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (*name=='/') {
if (flags & LOOKUP_RCU) {
rcu_read_lock();
- set_root_rcu(nd);
+ nd->seq = set_root_rcu(nd);
} else {
set_root(nd);
path_get(&nd->root);
@@ -1903,7 +1906,14 @@ static int path_init(int dfd, const char *name, unsigned int flags,
}
nd->inode = nd->path.dentry->d_inode;
- return 0;
+ if (!(flags & LOOKUP_RCU))
+ return 0;
+ if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
+ return 0;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ return -ECHILD;
}
static inline int lookup_last(struct nameidata *nd, struct path *path)
diff --git a/fs/namespace.c b/fs/namespace.c
index a01c7730e9af..ef42d9bee212 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1217,6 +1217,11 @@ static void namespace_unlock(void)
head.first->pprev = &head.first;
INIT_HLIST_HEAD(&unmounted);
+ /* undo decrements we'd done in umount_tree() */
+ hlist_for_each_entry(mnt, &head, mnt_hash)
+ if (mnt->mnt_ex_mountpoint.mnt)
+ mntget(mnt->mnt_ex_mountpoint.mnt);
+
up_write(&namespace_sem);
synchronize_rcu();
@@ -1253,6 +1258,9 @@ void umount_tree(struct mount *mnt, int how)
hlist_add_head(&p->mnt_hash, &tmp_list);
}
+ hlist_for_each_entry(p, &tmp_list, mnt_hash)
+ list_del_init(&p->mnt_child);
+
if (how)
propagate_umount(&tmp_list);
@@ -1263,9 +1271,9 @@ void umount_tree(struct mount *mnt, int how)
p->mnt_ns = NULL;
if (how < 2)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
- list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
put_mountpoint(p->mnt_mp);
+ mnt_add_count(p->mnt_parent, -1);
/* move the reference to mountpoint into ->mnt_ex_mountpoint */
p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h> /* struct bio */
-#include <linux/buffer_head.h> /* various write calls */
#include <linux/prefetch.h>
#include <linux/pagevec.h>
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
-static void print_page(struct page *page)
+static bool is_hole(struct pnfs_block_extent *be)
{
- dprintk("PRINTPAGE page %p\n", page);
- dprintk(" PagePrivate %d\n", PagePrivate(page));
- dprintk(" PageUptodate %d\n", PageUptodate(page));
- dprintk(" PageError %d\n", PageError(page));
- dprintk(" PageDirty %d\n", PageDirty(page));
- dprintk(" PageReferenced %d\n", PageReferenced(page));
- dprintk(" PageLocked %d\n", PageLocked(page));
- dprintk(" PageWriteback %d\n", PageWriteback(page));
- dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
- dprintk("\n");
-}
-
-/* Given the be associated with isect, determine if page data needs to be
- * initialized.
- */
-static int is_hole(struct pnfs_block_extent *be, sector_t isect)
-{
- if (be->be_state == PNFS_BLOCK_NONE_DATA)
- return 1;
- else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
- return 0;
- else
- return !bl_is_sector_init(be->be_inval, isect);
-}
-
-/* Given the be associated with isect, determine if page data can be
- * written to disk.
- */
-static int is_writable(struct pnfs_block_extent *be, sector_t isect)
-{
- return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA);
+ switch (be->be_state) {
+ case PNFS_BLOCK_NONE_DATA:
+ return true;
+ case PNFS_BLOCK_INVALID_DATA:
+ return be->be_tag ? false : true;
+ default:
+ return false;
+ }
}
/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
*/
struct parallel_io {
struct kref refcnt;
- void (*pnfs_callback) (void *data, int num_se);
+ void (*pnfs_callback) (void *data);
void *data;
- int bse_count;
};
static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
if (rv) {
rv->data = data;
kref_init(&rv->refcnt);
- rv->bse_count = 0;
}
return rv;
}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
dprintk("%s enter\n", __func__);
- p->pnfs_callback(p->data, p->bse_count);
+ p->pnfs_callback(p->data);
kfree(p);
}
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+ void (*end_io)(struct bio *, int err), struct parallel_io *par)
{
struct bio *bio;
@@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
}
if (bio) {
- bio->bi_iter.bi_sector = isect - be->be_f_offset +
- be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
+ bio->bi_iter.bi_sector = disk_sector;
+ bio->bi_bdev = bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
return bio;
}
-static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par,
- unsigned int offset, int len)
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+ struct page *page, struct pnfs_block_dev_map *map,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par, unsigned int offset, int *len)
{
- isect = isect + (offset >> SECTOR_SHIFT);
+ struct pnfs_block_dev *dev =
+ container_of(be->be_device, struct pnfs_block_dev, node);
+ u64 disk_addr, end;
+
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
- npg, rw, (unsigned long long)isect, offset, len);
+ npg, rw, (unsigned long long)isect, offset, *len);
+
+ /* translate to device offset */
+ isect += be->be_v_offset;
+ isect -= be->be_f_offset;
+
+ /* translate to physical disk offset */
+ disk_addr = (u64)isect << SECTOR_SHIFT;
+ if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+ if (!dev->map(dev, disk_addr, map))
+ return ERR_PTR(-EIO);
+ bio = bl_submit_bio(rw, bio);
+ }
+ disk_addr += map->disk_offset;
+ disk_addr -= map->start;
+
+ /* limit length to what the device mapping allows */
+ end = disk_addr + *len;
+ if (end >= map->start + map->len)
+ *len = map->start + map->len - disk_addr;
+
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+ bio = bl_alloc_init_bio(npg, map->bdev,
+ disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
- if (bio_add_page(bio, page, len, offset) < len) {
+ if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
return bio;
}
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
-{
- return do_add_page_to_bio(bio, npg, rw, isect, page, be,
- end_io, par, 0, PAGE_CACHE_SIZE);
-}
-
-/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
- struct bio_vec *bvec;
- int i;
-
- if (!err)
- bio_for_each_segment_all(bvec, bio, i)
- SetPageUptodate(bvec->bv_page);
if (err) {
struct nfs_pgio_header *header = par->data;
@@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
}
+
bio_put(bio);
put_parallel(par);
}
@@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work)
}
static void
-bl_end_par_io_read(void *data, int unused)
+bl_end_par_io_read(void *data)
{
struct nfs_pgio_header *hdr = data;
@@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused)
}
static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_header *hdr)
+bl_read_pagelist(struct nfs_pgio_header *header)
{
- struct nfs_pgio_header *header = hdr;
- int i, hole;
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
struct parallel_io *par;
- loff_t f_offset = hdr->args.offset;
- size_t bytes_left = hdr->args.count;
+ loff_t f_offset = header->args.offset;
+ size_t bytes_left = header->args.count;
unsigned int pg_offset, pg_len;
- struct page **pages = hdr->args.pages;
- int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
+ struct blk_plug plug;
+ int i;
dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
- hdr->page_array.npages, f_offset,
- (unsigned int)hdr->args.count);
+ header->page_array.npages, f_offset,
+ (unsigned int)header->args.count);
- par = alloc_parallel(hdr);
+ par = alloc_parallel(header);
if (!par)
- goto use_mds;
+ return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_read;
- /* At this point, we can no longer jump to use_mds */
+
+ blk_start_plug(&plug);
isect = (sector_t) (f_offset >> SECTOR_SHIFT);
/* Code assumes extents are page-aligned */
- for (i = pg_index; i < hdr->page_array.npages; i++) {
- if (!extent_length) {
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
/* We've used up the previous extent */
- bl_put_extent(be);
- bl_put_extent(cow_read);
bio = bl_submit_bio(READ, bio);
+
/* Get the next one */
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, &cow_read);
- if (!be) {
+ if (!ext_tree_lookup(bl, isect, &be, false)) {
header->pnfs_error = -EIO;
goto out;
}
- extent_length = be->be_length -
- (isect - be->be_f_offset);
- if (cow_read) {
- sector_t cow_length = cow_read->be_length -
- (isect - cow_read->be_f_offset);
- extent_length = min(extent_length, cow_length);
- }
+ extent_length = be.be_length - (isect - be.be_f_offset);
}
+ pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
else
pg_len = bytes_left;
-
- f_offset += pg_len;
- bytes_left -= pg_len;
- isect += (pg_offset >> SECTOR_SHIFT);
} else {
- pg_offset = 0;
+ BUG_ON(pg_offset != 0);
pg_len = PAGE_CACHE_SIZE;
}
- hole = is_hole(be, isect);
- if (hole && !cow_read) {
+ isect += (pg_offset >> SECTOR_SHIFT);
+ extent_length -= (pg_offset >> SECTOR_SHIFT);
+
+ if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
- print_page(pages[i]);
- SetPageUptodate(pages[i]);
- } else {
- struct pnfs_block_extent *be_read;
- be_read = (hole && cow_read) ? cow_read : be;
+ /* invalidate map */
+ map.start = NFS4_MAX_UINT64;
+ } else {
bio = do_add_page_to_bio(bio,
- hdr->page_array.npages - i,
+ header->page_array.npages - i,
READ,
- isect, pages[i], be_read,
+ isect, pages[i], &map, &be,
bl_end_io_read, par,
- pg_offset, pg_len);
+ pg_offset, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
@@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
}
}
isect += (pg_len >> SECTOR_SHIFT);
- extent_length -= PAGE_CACHE_SECTORS;
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ f_offset += pg_len;
+ bytes_left -= pg_len;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
- hdr->res.eof = 1;
- hdr->res.count = header->inode->i_size - hdr->args.offset;
+ header->res.eof = 1;
+ header->res.count = header->inode->i_size - header->args.offset;
} else {
- hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
+ header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
}
out:
- bl_put_extent(be);
- bl_put_extent(cow_read);
bl_submit_bio(READ, bio);
+ blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
-
- use_mds:
- dprintk("Giving up and using normal NFS\n");
- return PNFS_NOT_ATTEMPTED;
-}
-
-static void mark_extents_written(struct pnfs_block_layout *bl,
- __u64 offset, __u32 count)
-{
- sector_t isect, end;
- struct pnfs_block_extent *be;
- struct pnfs_block_short_extent *se;
-
- dprintk("%s(%llu, %u)\n", __func__, offset, count);
- if (count == 0)
- return;
- isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
- end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
- end >>= SECTOR_SHIFT;
- while (isect < end) {
- sector_t len;
- be = bl_find_get_extent(bl, isect, NULL);
- BUG_ON(!be); /* FIXME */
- len = min(end, be->be_f_offset + be->be_length) - isect;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- se = bl_pop_one_short_extent(be->be_inval);
- BUG_ON(!se);
- bl_mark_for_commit(be, isect, len, se);
- }
- isect += len;
- bl_put_extent(be);
- }
-}
-
-static void bl_end_io_write_zero(struct bio *bio, int err)
-{
- struct parallel_io *par = bio->bi_private;
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- /* This is the zeroing page we added */
- end_page_writeback(bvec->bv_page);
- page_cache_release(bvec->bv_page);
- }
-
- if (unlikely(err)) {
- struct nfs_pgio_header *header = par->data;
-
- if (!header->pnfs_error)
- header->pnfs_error = -EIO;
- pnfs_set_lo_fail(header->lseg);
- }
- bio_put(bio);
- put_parallel(par);
}
static void bl_end_io_write(struct bio *bio, int err)
@@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
*/
static void bl_write_cleanup(struct work_struct *work)
{
- struct rpc_task *task;
- struct nfs_pgio_header *hdr;
+ struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+ struct nfs_pgio_header *hdr =
+ container_of(task, struct nfs_pgio_header, task);
+
dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- hdr = container_of(task, struct nfs_pgio_header, task);
+
if (likely(!hdr->pnfs_error)) {
- /* Marks for LAYOUTCOMMIT */
- mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
- hdr->args.offset, hdr->args.count);
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+ u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+ u64 end = (hdr->args.offset + hdr->args.count +
+ PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+
+ ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+ (end - start) >> SECTOR_SHIFT);
}
+
pnfs_ld_write_done(hdr);
}
/* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data, int num_se)
+static void bl_end_par_io_write(void *data)
{
struct nfs_pgio_header *hdr = data;
- if (unlikely(hdr->pnfs_error)) {
- bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
- num_se);
- }
-
hdr->task.tk_status = hdr->pnfs_error;
hdr->verf.committed = NFS_FILE_SYNC;
INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
schedule_work(&hdr->task.u.tk_work);
}
-/* FIXME STUB - mark intersection of layout and page as bad, so is not
- * used again.
- */
-static void mark_bad_read(void)
-{
- return;
-}
-
-/*
- * map_block: map a requested I/0 block (isect) into an offset in the LVM
- * block_device
- */
-static void
-map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
-{
- dprintk("%s enter be=%p\n", __func__, be);
-
- set_buffer_mapped(bh);
- bh->b_bdev = be->be_mdev;
- bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
- (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
-
- dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
- __func__, (unsigned long long)isect, (long)bh->b_blocknr,
- bh->b_size);
- return;
-}
-
-static void
-bl_read_single_end_io(struct bio *bio, int error)
-{
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct page *page = bvec->bv_page;
-
- /* Only one page in bvec */
- unlock_page(page);
-}
-
-static int
-bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
- unsigned int offset, unsigned int len)
-{
- struct bio *bio;
- struct page *shadow_page;
- sector_t isect;
- char *kaddr, *kshadow_addr;
- int ret = 0;
-
- dprintk("%s: offset %u len %u\n", __func__, offset, len);
-
- shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (shadow_page == NULL)
- return -ENOMEM;
-
- bio = bio_alloc(GFP_NOIO, 1);
- if (bio == NULL)
- return -ENOMEM;
-
- isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
- (offset / SECTOR_SIZE);
-
- bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
- bio->bi_end_io = bl_read_single_end_io;
-
- lock_page(shadow_page);
- if (bio_add_page(bio, shadow_page,
- SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
- unlock_page(shadow_page);
- bio_put(bio);
- return -EIO;
- }
-
- submit_bio(READ, bio);
- wait_on_page_locked(shadow_page);
- if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
- ret = -EIO;
- } else {
- kaddr = kmap_atomic(page);
- kshadow_addr = kmap_atomic(shadow_page);
- memcpy(kaddr + offset, kshadow_addr + offset, len);
- kunmap_atomic(kshadow_addr);
- kunmap_atomic(kaddr);
- }
- __free_page(shadow_page);
- bio_put(bio);
-
- return ret;
-}
-
-static int
-bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
- unsigned int dirty_offset, unsigned int dirty_len,
- bool full_page)
-{
- int ret = 0;
- unsigned int start, end;
-
- if (full_page) {
- start = 0;
- end = PAGE_CACHE_SIZE;
- } else {
- start = round_down(dirty_offset, SECTOR_SIZE);
- end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
- }
-
- dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
- if (!be) {
- zero_user_segments(page, start, dirty_offset,
- dirty_offset + dirty_len, end);
- if (start == 0 && end == PAGE_CACHE_SIZE &&
- trylock_page(page)) {
- SetPageUptodate(page);
- unlock_page(page);
- }
- return ret;
- }
-
- if (start != dirty_offset)
- ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
-
- if (!ret && (dirty_offset + dirty_len < end))
- ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
- end - dirty_offset - dirty_len);
-
- return ret;
-}
-
-/* Given an unmapped page, zero it or read in page for COW, page is locked
- * by caller.
- */
-static int
-init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
-{
- struct buffer_head *bh = NULL;
- int ret = 0;
- sector_t isect;
-
- dprintk("%s enter, %p\n", __func__, page);
- BUG_ON(PageUptodate(page));
- if (!cow_read) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- goto cleanup;
- }
-
- bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
- if (!bh) {
- ret = -ENOMEM;
- goto cleanup;
- }
-
- isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
- map_block(bh, isect, cow_read);
- if (!bh_uptodate_or_lock(bh))
- ret = bh_submit_read(bh);
- if (ret)
- goto cleanup;
- SetPageUptodate(page);
-
-cleanup:
- if (bh)
- free_buffer_head(bh);
- if (ret) {
- /* Need to mark layout with bad read...should now
- * just use nfs4 for reads and writes.
- */
- mark_bad_read();
- }
- return ret;
-}
-
-/* Find or create a zeroing page marked being writeback.
- * Return ERR_PTR on error, NULL to indicate skip this page and page itself
- * to indicate write out.
- */
-static struct page *
-bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
- struct pnfs_block_extent *cow_read)
-{
- struct page *page;
- int locked = 0;
- page = find_get_page(inode->i_mapping, index);
- if (page)
- goto check_page;
-
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
- if (unlikely(!page)) {
- dprintk("%s oom\n", __func__);
- return ERR_PTR(-ENOMEM);
- }
- locked = 1;
-
-check_page:
- /* PageDirty: Other will write this out
- * PageWriteback: Other is writing this out
- * PageUptodate: It was read before
- */
- if (PageDirty(page) || PageWriteback(page)) {
- print_page(page);
- if (locked)
- unlock_page(page);
- page_cache_release(page);
- return NULL;
- }
-
- if (!locked) {
- lock_page(page);
- locked = 1;
- goto check_page;
- }
- if (!PageUptodate(page)) {
- /* New page, readin or zero it */
- init_page_for_write(page, cow_read);
- }
- set_page_writeback(page);
- unlock_page(page);
-
- return page;
-}
-
static enum pnfs_try_status
bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
- int i, ret, npg_zero, pg_index, last = 0;
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL, *cow_read = NULL;
- sector_t isect, last_isect = 0, extent_length = 0;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
struct parallel_io *par = NULL;
loff_t offset = header->args.offset;
size_t count = header->args.count;
- unsigned int pg_offset, pg_len, saved_len;
struct page **pages = header->args.pages;
- struct page *page;
- pgoff_t index;
- u64 temp;
- int npg_per_block =
- NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+ int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ unsigned int pg_len;
+ struct blk_plug plug;
+ int i;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
- if (header->dreq != NULL &&
- (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
- !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
- dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
- goto out_mds;
- }
/* At this point, header->page_aray is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error set pnfs_error
* to have it redone using nfs.
*/
par = alloc_parallel(header);
if (!par)
- goto out_mds;
+ return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_write;
- /* At this point, have to be more careful with error handling */
- isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
- if (!be || !is_writable(be, isect)) {
- dprintk("%s no matching extents!\n", __func__);
- goto out_mds;
- }
+ blk_start_plug(&plug);
- /* First page inside INVALID extent */
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (likely(!bl_push_one_short_extent(be->be_inval)))
- par->bse_count++;
- else
- goto out_mds;
- temp = offset >> PAGE_CACHE_SHIFT;
- npg_zero = do_div(temp, npg_per_block);
- isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
- (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- extent_length = be->be_length - (isect - be->be_f_offset);
-
-fill_invalid_ext:
- dprintk("%s need to zero %d pages\n", __func__, npg_zero);
- for (;npg_zero > 0; npg_zero--) {
- if (bl_is_sector_init(be->be_inval, isect)) {
- dprintk("isect %llu already init\n",
- (unsigned long long)isect);
- goto next_page;
- }
- /* page ref released in bl_end_io_write_zero */
- index = isect >> PAGE_CACHE_SECTOR_SHIFT;
- dprintk("%s zero %dth page: index %lu isect %llu\n",
- __func__, npg_zero, index,
- (unsigned long long)isect);
- page = bl_find_get_zeroing_page(header->inode, index,
- cow_read);
- if (unlikely(IS_ERR(page))) {
- header->pnfs_error = PTR_ERR(page);
- goto out;
- } else if (page == NULL)
- goto next_page;
-
- ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS);
- if (unlikely(ret)) {
- dprintk("%s bl_mark_sectors_init fail %d\n",
- __func__, ret);
- end_page_writeback(page);
- page_cache_release(page);
- header->pnfs_error = ret;
- goto out;
- }
- if (likely(!bl_push_one_short_extent(be->be_inval)))
- par->bse_count++;
- else {
- end_page_writeback(page);
- page_cache_release(page);
- header->pnfs_error = -ENOMEM;
- goto out;
- }
- /* FIXME: This should be done in bi_end_io */
- mark_extents_written(BLK_LSEG2EXT(header->lseg),
- page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE);
-
- bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
- isect, page, be,
- bl_end_io_write_zero, par);
- if (IS_ERR(bio)) {
- header->pnfs_error = PTR_ERR(bio);
- bio = NULL;
- goto out;
- }
-next_page:
- isect += PAGE_CACHE_SECTORS;
- extent_length -= PAGE_CACHE_SECTORS;
- }
- if (last)
- goto write_done;
- }
- bio = bl_submit_bio(WRITE, bio);
+ /* we always write out the whole page */
+ offset = offset & (loff_t)PAGE_CACHE_MASK;
+ isect = offset >> SECTOR_SHIFT;
- /* Middle pages */
- pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
for (i = pg_index; i < header->page_array.npages; i++) {
- if (!extent_length) {
+ if (extent_length <= 0) {
/* We've used up the previous extent */
- bl_put_extent(be);
- bl_put_extent(cow_read);
bio = bl_submit_bio(WRITE, bio);
/* Get the next one */
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, &cow_read);
- if (!be || !is_writable(be, isect)) {
+ if (!ext_tree_lookup(bl, isect, &be, true)) {
header->pnfs_error = -EINVAL;
goto out;
}
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (likely(!bl_push_one_short_extent(
- be->be_inval)))
- par->bse_count++;
- else {
- header->pnfs_error = -ENOMEM;
- goto out;
- }
- }
- extent_length = be->be_length -
- (isect - be->be_f_offset);
- }
-
- dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
- pg_offset = offset & ~PAGE_CACHE_MASK;
- if (pg_offset + count > PAGE_CACHE_SIZE)
- pg_len = PAGE_CACHE_SIZE - pg_offset;
- else
- pg_len = count;
-
- saved_len = pg_len;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
- !bl_is_sector_init(be->be_inval, isect)) {
- ret = bl_read_partial_page_sync(pages[i], cow_read,
- pg_offset, pg_len, true);
- if (ret) {
- dprintk("%s bl_read_partial_page_sync fail %d\n",
- __func__, ret);
- header->pnfs_error = ret;
- goto out;
- }
-
- ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS);
- if (unlikely(ret)) {
- dprintk("%s bl_mark_sectors_init fail %d\n",
- __func__, ret);
- header->pnfs_error = ret;
- goto out;
- }
- /* Expand to full page write */
- pg_offset = 0;
- pg_len = PAGE_CACHE_SIZE;
- } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
- (pg_len & (SECTOR_SIZE - 1))){
- /* ahh, nasty case. We have to do sync full sector
- * read-modify-write cycles.
- */
- unsigned int saved_offset = pg_offset;
- ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
- pg_len, false);
- pg_offset = round_down(pg_offset, SECTOR_SIZE);
- pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
- - pg_offset;
+ extent_length = be.be_length - (isect - be.be_f_offset);
}
-
+ pg_len = PAGE_CACHE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
- WRITE,
- isect, pages[i], be,
+ WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
- pg_offset, pg_len);
+ 0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
- offset += saved_len;
- count -= saved_len;
- isect += PAGE_CACHE_SECTORS;
- last_isect = isect;
- extent_length -= PAGE_CACHE_SECTORS;
- }
- /* Last page inside INVALID extent */
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- bio = bl_submit_bio(WRITE, bio);
- temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
- npg_zero = npg_per_block - do_div(temp, npg_per_block);
- if (npg_zero < npg_per_block) {
- last = 1;
- goto fill_invalid_ext;
- }
+ offset += pg_len;
+ count -= pg_len;
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
}
-write_done:
header->res.count = header->args.count;
out:
- bl_put_extent(be);
- bl_put_extent(cow_read);
bl_submit_bio(WRITE, bio);
+ blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
-out_mds:
- bl_put_extent(be);
- bl_put_extent(cow_read);
- kfree(par);
- return PNFS_NOT_ATTEMPTED;
-}
-
-/* FIXME - range ignored */
-static void
-release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
-{
- int i;
- struct pnfs_block_extent *be;
-
- spin_lock(&bl->bl_ext_lock);
- for (i = 0; i < EXTENT_LISTS; i++) {
- while (!list_empty(&bl->bl_extents[i])) {
- be = list_first_entry(&bl->bl_extents[i],
- struct pnfs_block_extent,
- be_node);
- list_del(&be->be_node);
- bl_put_extent(be);
- }
- }
- spin_unlock(&bl->bl_ext_lock);
-}
-
-static void
-release_inval_marks(struct pnfs_inval_markings *marks)
-{
- struct pnfs_inval_tracking *pos, *temp;
- struct pnfs_block_short_extent *se, *stemp;
-
- list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
- list_del(&pos->it_link);
- kfree(pos);
- }
-
- list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
- list_del(&se->bse_node);
- kfree(se);
- }
- return;
}
static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ int err;
dprintk("%s enter\n", __func__);
- release_extents(bl, NULL);
- release_inval_marks(&bl->bl_inval);
+
+ err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+ WARN_ON(err);
+
kfree(bl);
}
@@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
bl = kzalloc(sizeof(*bl), gfp_flags);
if (!bl)
return NULL;
+
+ bl->bl_ext_rw = RB_ROOT;
+ bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
- INIT_LIST_HEAD(&bl->bl_extents[0]);
- INIT_LIST_HEAD(&bl->bl_extents[1]);
- INIT_LIST_HEAD(&bl->bl_commit);
- INIT_LIST_HEAD(&bl->bl_committing);
- bl->bl_count = 0;
- bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
- BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+
return &bl->bl_layout;
}
@@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
kfree(lseg);
}
-/* We pretty much ignore lseg, and store all data layout wide, so we
- * can correctly merge.
- */
-static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr,
- gfp_t gfp_flags)
-{
- struct pnfs_layout_segment *lseg;
- int status;
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
- dprintk("%s enter\n", __func__);
- lseg = kzalloc(sizeof(*lseg), gfp_flags);
- if (!lseg)
- return ERR_PTR(-ENOMEM);
- status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
- if (status) {
- /* We don't want to call the full-blown bl_free_lseg,
- * since on error extents were not touched.
- */
- kfree(lseg);
- return ERR_PTR(status);
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+ struct layout_verification *lv)
+{
+ if (lv->mode == IOMODE_READ) {
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA)
+ return -EIO;
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
}
- return lseg;
+ /* lv->mode == IOMODE_RW */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ if (lv->cowread > lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ lv->inval = lv->start;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+ if (be->be_f_offset > lv->start)
+ return -EIO;
+ if (be->be_f_offset < lv->inval)
+ return -EIO;
+ if (be->be_f_offset < lv->cowread)
+ return -EIO;
+ /* It looks like you might want to min this with lv->start,
+ * but you really don't.
+ */
+ lv->inval = lv->inval + be->be_length;
+ lv->cowread = be->be_f_offset + be->be_length;
+ return 0;
+ } else
+ return -EIO;
}
-static void
-bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg)
+static int decode_sector_number(__be32 **rp, sector_t *sp)
{
- dprintk("%s enter\n", __func__);
- encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+ uint64_t s;
+
+ *rp = xdr_decode_hyper(*rp, &s);
+ if (s & 0x1ff) {
+ printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+ return -1;
+ }
+ *sp = s >> SECTOR_SHIFT;
+ return 0;
}
-static void
-bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+ struct layout_verification *lv, struct list_head *extents,
+ gfp_t gfp_mask)
{
- struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
+ struct pnfs_block_extent *be;
+ struct nfs4_deviceid id;
+ int error;
+ __be32 *p;
- dprintk("%s enter\n", __func__);
- clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
-}
+ p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+ if (!p)
+ return -EIO;
-static void free_blk_mountid(struct block_mount_id *mid)
-{
- if (mid) {
- struct pnfs_block_dev *dev, *tmp;
+ be = kzalloc(sizeof(*be), GFP_NOFS);
+ if (!be)
+ return -ENOMEM;
- /* No need to take bm_lock as we are last user freeing bm_devlist */
- list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
- list_del(&dev->bm_node);
- bl_free_block_dev(dev);
- }
- kfree(mid);
+ memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ error = -EIO;
+ be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+ lo->plh_lc_cred, gfp_mask);
+ if (!be->be_device)
+ goto out_free_be;
+
+ /*
+ * The next three values are read in as bytes, but stored in the
+ * extent structure in 512-byte granularity.
+ */
+ if (decode_sector_number(&p, &be->be_f_offset) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_length) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_v_offset) < 0)
+ goto out_put_deviceid;
+ be->be_state = be32_to_cpup(p++);
+
+ error = verify_extent(be, lv);
+ if (error) {
+ dprintk("%s: extent verification failed\n", __func__);
+ goto out_put_deviceid;
}
+
+ list_add_tail(&be->be_list, extents);
+ return 0;
+
+out_put_deviceid:
+ nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+ kfree(be);
+ return error;
}
-/* This is mostly copied from the filelayout_get_device_info function.
- * It seems much of this should be at the generic pnfs level.
- */
-static struct pnfs_block_dev *
-nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
- struct nfs4_deviceid *d_id)
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_mask)
{
- struct pnfs_device *dev;
- struct pnfs_block_dev *rv;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- int i, rc;
+ struct layout_verification lv = {
+ .mode = lgr->range.iomode,
+ .start = lgr->range.offset >> SECTOR_SHIFT,
+ .inval = lgr->range.offset >> SECTOR_SHIFT,
+ .cowread = lgr->range.offset >> SECTOR_SHIFT,
+ };
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ struct pnfs_layout_segment *lseg;
+ struct xdr_buf buf;
+ struct xdr_stream xdr;
+ struct page *scratch;
+ int status, i;
+ uint32_t count;
+ __be32 *p;
+ LIST_HEAD(extents);
+
+ dprintk("---> %s\n", __func__);
+
+ lseg = kzalloc(sizeof(*lseg), gfp_mask);
+ if (!lseg)
+ return ERR_PTR(-ENOMEM);
+
+ status = -ENOMEM;
+ scratch = alloc_page(gfp_mask);
+ if (!scratch)
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf,
+ lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+ status = -EIO;
+ p = xdr_inline_decode(&xdr, 4);
+ if (unlikely(!p))
+ goto out_free_scratch;
+
+ count = be32_to_cpup(p++);
+ dprintk("%s: number of extents %d\n", __func__, count);
/*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
+ * Decode individual extents, putting them in temporary staging area
+ * until whole layout is decoded to make error recovery easier.
*/
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s max_resp_sz %u max_pages %d\n",
- __func__, max_resp_sz, max_pages);
-
- dev = kmalloc(sizeof(*dev), GFP_NOFS);
- if (!dev) {
- dprintk("%s kmalloc failed\n", __func__);
- return ERR_PTR(-ENOMEM);
+ for (i = 0; i < count; i++) {
+ status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+ if (status)
+ goto process_extents;
}
- pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
+ if (lgr->range.offset + lgr->range.length !=
+ lv.start << SECTOR_SHIFT) {
+ dprintk("%s Final length mismatch\n", __func__);
+ status = -EIO;
+ goto process_extents;
}
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- rv = ERR_PTR(-ENOMEM);
- goto out_free;
- }
+
+ if (lv.start < lv.cowread) {
+ dprintk("%s Final uncovered COW extent\n", __func__);
+ status = -EIO;
}
- memcpy(&dev->dev_id, d_id, sizeof(*d_id));
- dev->layout_type = LAYOUT_BLOCK_VOLUME;
- dev->pages = pages;
- dev->pgbase = 0;
- dev->pglen = PAGE_SIZE * max_pages;
- dev->mincount = 0;
- dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc) {
- rv = ERR_PTR(rc);
- goto out_free;
+process_extents:
+ while (!list_empty(&extents)) {
+ struct pnfs_block_extent *be =
+ list_first_entry(&extents, struct pnfs_block_extent,
+ be_list);
+ list_del(&be->be_list);
+
+ if (!status)
+ status = ext_tree_insert(bl, be);
+
+ if (status) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
}
- rv = nfs4_blk_decode_device(server, dev);
- out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(dev);
- return rv;
+out_free_scratch:
+ __free_page(scratch);
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ if (status) {
+ kfree(lseg);
+ return ERR_PTR(status);
+ }
+ return lseg;
}
-static int
-bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
- struct block_mount_id *b_mt_id = NULL;
- struct pnfs_devicelist *dlist = NULL;
- struct pnfs_block_dev *bdev;
- LIST_HEAD(block_disklist);
- int status, i;
-
- dprintk("%s enter\n", __func__);
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ sector_t offset = range->offset >> SECTOR_SHIFT, end;
- if (server->pnfs_blksize == 0) {
- dprintk("%s Server did not return blksize\n", __func__);
- return -EINVAL;
- }
- b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
- if (!b_mt_id) {
- status = -ENOMEM;
- goto out_error;
- }
- /* Initialize nfs4 block layout mount id */
- spin_lock_init(&b_mt_id->bm_lock);
- INIT_LIST_HEAD(&b_mt_id->bm_devlist);
-
- dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
- if (!dlist) {
- status = -ENOMEM;
- goto out_error;
+ if (range->offset % 8) {
+ dprintk("%s: offset %lld not block size aligned\n",
+ __func__, range->offset);
+ return;
}
- dlist->eof = 0;
- while (!dlist->eof) {
- status = nfs4_proc_getdevicelist(server, fh, dlist);
- if (status)
- goto out_error;
- dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
- __func__, dlist->num_devs, dlist->eof);
- for (i = 0; i < dlist->num_devs; i++) {
- bdev = nfs4_blk_get_deviceinfo(server, fh,
- &dlist->dev_id[i]);
- if (IS_ERR(bdev)) {
- status = PTR_ERR(bdev);
- goto out_error;
- }
- spin_lock(&b_mt_id->bm_lock);
- list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
- spin_unlock(&b_mt_id->bm_lock);
+
+ if (range->length != NFS4_MAX_UINT64) {
+ if (range->length % 8) {
+ dprintk("%s: length %lld not block size aligned\n",
+ __func__, range->length);
+ return;
}
- }
- dprintk("%s SUCCESS\n", __func__);
- server->pnfs_ld_data = b_mt_id;
- out_return:
- kfree(dlist);
- return status;
+ end = offset + (range->length >> SECTOR_SHIFT);
+ } else {
+ end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+ }
- out_error:
- free_blk_mountid(b_mt_id);
- goto out_return;
+ ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
}
static int
-bl_clear_layoutdriver(struct nfs_server *server)
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
+{
+ return ext_tree_prepare_commit(arg);
+}
+
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
- struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+ ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
dprintk("%s enter\n", __func__);
- free_blk_mountid(b_mt_id);
- dprintk("%s RETURNS\n", __func__);
+
+ if (server->pnfs_blksize == 0) {
+ dprintk("%s Server did not return blksize\n", __func__);
+ return -EINVAL;
+ }
+ if (server->pnfs_blksize > PAGE_SIZE) {
+ printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+ __func__, server->pnfs_blksize);
+ return -EINVAL;
+ }
+
return 0;
}
static bool
-is_aligned_req(struct nfs_page *req, unsigned int alignment)
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, unsigned int alignment)
{
- return IS_ALIGNED(req->wb_offset, alignment) &&
- IS_ALIGNED(req->wb_bytes, alignment);
+ /*
+ * Always accept buffered writes, higher layers take care of the
+ * right alignment.
+ */
+ if (pgio->pg_dreq == NULL)
+ return true;
+
+ if (!IS_ALIGNED(req->wb_offset, alignment))
+ return false;
+
+ if (IS_ALIGNED(req->wb_bytes, alignment))
+ return true;
+
+ if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+ /*
+ * If the write goes up to the inode size, just write
+ * the full page. Data past the inode size is
+ * guaranteed to be zeroed by the higher level client
+ * code, and this behaviour is mandated by RFC 5663
+ * section 2.3.2.
+ */
+ return true;
+ }
+
+ return false;
}
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
nfs_pageio_reset_read_mds(pgio);
- else
- pnfs_generic_pg_init_read(pgio, req);
+ return;
+ }
+
+ pnfs_generic_pg_init_read(pgio, req);
}
/*
@@ -1196,10 +796,8 @@ static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE))
return 0;
-
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
static void
bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, PAGE_CACHE_SIZE)) {
+ u64 wb_size;
+
+ if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
nfs_pageio_reset_write_mds(pgio);
- } else {
- u64 wb_size;
- if (pgio->pg_dreq == NULL)
- wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
- req->wb_index);
- else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
- pnfs_generic_pg_init_write(pgio, req, wb_size);
+ return;
}
+
+ if (pgio->pg_dreq == NULL)
+ wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+ req->wb_index);
+ else
+ wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+ pnfs_generic_pg_init_write(pgio, req, wb_size);
}
/*
@@ -1252,10 +851,8 @@ static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, PAGE_CACHE_SIZE))
+ if (!is_aligned_req(pgio, req, PAGE_SIZE))
return 0;
-
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.id = LAYOUT_BLOCK_VOLUME,
.name = "LAYOUT_BLOCK_VOLUME",
.owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_READ_WHOLE_PAGE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = bl_alloc_layout_hdr,
.free_layout_hdr = bl_free_layout_hdr,
.alloc_lseg = bl_alloc_lseg,
.free_lseg = bl_free_lseg,
- .encode_layoutcommit = bl_encode_layoutcommit,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
- .clear_layoutdriver = bl_clear_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
};
-static const struct rpc_pipe_ops bl_upcall_ops = {
- .upcall = rpc_pipe_generic_upcall,
- .downcall = bl_pipe_downcall,
- .destroy_msg = bl_pipe_destroy_msg,
-};
-
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- struct dentry *dir, *dentry;
-
- dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
- if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
- dput(dir);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
- void *ptr)
-{
- struct super_block *sb = ptr;
- struct net *net = sb->s_fs_info;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return 0;
-
- if (nn->bl_device_pipe == NULL) {
- module_put(THIS_MODULE);
- return 0;
- }
-
- switch (event) {
- case RPC_PIPEFS_MOUNT:
- dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- nn->bl_device_pipe->dentry = dentry;
- break;
- case RPC_PIPEFS_UMOUNT:
- if (nn->bl_device_pipe->dentry)
- nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
- break;
- default:
- ret = -ENOTSUPP;
- break;
- }
- module_put(THIS_MODULE);
- return ret;
-}
-
-static struct notifier_block nfs4blocklayout_block = {
- .notifier_call = rpc_pipefs_event,
-};
-
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
- struct dentry *dentry;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (!pipefs_sb)
- return NULL;
- dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (pipefs_sb) {
- nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- }
-}
-
-static int nfs4blocklayout_net_init(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
-
- init_waitqueue_head(&nn->bl_wq);
- nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
- if (IS_ERR(nn->bl_device_pipe))
- return PTR_ERR(nn->bl_device_pipe);
- dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- return PTR_ERR(dentry);
- }
- nn->bl_device_pipe->dentry = dentry;
- return 0;
-}
-
-static void nfs4blocklayout_net_exit(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- nn->bl_device_pipe = NULL;
-}
-
-static struct pernet_operations nfs4blocklayout_net_ops = {
- .init = nfs4blocklayout_net_init,
- .exit = nfs4blocklayout_net_exit,
-};
-
static int __init nfs4blocklayout_init(void)
{
int ret;
@@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
goto out;
-
- ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ ret = bl_init_pipefs();
if (ret)
- goto out_remove;
- ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
- if (ret)
- goto out_notifier;
-out:
- return ret;
+ goto out_unregister;
+ return 0;
-out_notifier:
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-out_remove:
+out_unregister:
pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
return ret;
}
@@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
- unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+ bl_cleanup_pipefs();
pnfs_unregister_layoutdriver(&blocklayout_type);
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-struct block_mount_id {
- spinlock_t bm_lock; /* protects list */
- struct list_head bm_devlist; /* holds pnfs_block_dev */
-};
+struct pnfs_block_dev;
-struct pnfs_block_dev {
- struct list_head bm_node;
- struct nfs4_deviceid bm_mdevid; /* associated devid */
- struct block_device *bm_mdev; /* meta device itself */
- struct net *net;
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
+#define PNFS_BLOCK_MAX_UUIDS 4
+#define PNFS_BLOCK_MAX_DEVICES 64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ int len;
+ int nr_sigs;
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } sigs[PNFS_BLOCK_MAX_UUIDS];
+ } simple;
+ struct {
+ u64 start;
+ u64 len;
+ u32 volume;
+ } slice;
+ struct {
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } concat;
+ struct {
+ u64 chunk_size;
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } stripe;
+ };
};
-#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+struct pnfs_block_dev_map {
+ sector_t start;
+ sector_t len;
-struct my_tree {
- sector_t mtt_step_size; /* Internal sector alignment */
- struct list_head mtt_stub; /* Should be a radix tree */
+ sector_t disk_offset;
+ struct block_device *bdev;
};
-struct pnfs_inval_markings {
- spinlock_t im_lock;
- struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
- sector_t im_block_size; /* Server blocksize in sectors */
- struct list_head im_extents; /* Short extents for INVAL->RW conversion */
+struct pnfs_block_dev {
+ struct nfs4_deviceid_node node;
+
+ u64 start;
+ u64 len;
+
+ u32 nr_children;
+ struct pnfs_block_dev *children;
+ u64 chunk_size;
+
+ struct block_device *bdev;
+ u64 disk_offset;
+
+ bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map);
};
-struct pnfs_inval_tracking {
- struct list_head it_link;
- int it_sector;
- int it_tags;
+enum exstate4 {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
};
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
- struct kref be_refcnt;
- struct list_head be_node; /* link into lseg list */
- struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
- struct block_device *be_mdev;
+ union {
+ struct rb_node be_node;
+ struct list_head be_list;
+ };
+ struct nfs4_deviceid_node *be_device;
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
enum exstate4 be_state; /* the state of this extent */
- struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+#define EXTENT_WRITTEN 1
+#define EXTENT_COMMITTING 2
+ unsigned int be_tag;
};
-/* Shortened extent used by LAYOUTCOMMIT */
-struct pnfs_block_short_extent {
- struct list_head bse_node;
- struct nfs4_deviceid bse_devid;
- struct block_device *bse_mdev;
- sector_t bse_f_offset; /* the starting offset in the file */
- sector_t bse_length; /* the size of the extent */
-};
-
-static inline void
-BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-{
- spin_lock_init(&marks->im_lock);
- INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
- INIT_LIST_HEAD(&marks->im_extents);
- marks->im_block_size = blocksize;
- marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
- blocksize);
-}
-
-enum extentclass4 {
- RW_EXTENT = 0, /* READWRTE and INVAL */
- RO_EXTENT = 1, /* READ and NONE */
- EXTENT_LISTS = 2,
-};
-
-static inline int bl_choose_list(enum exstate4 state)
-{
- if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
- return RO_EXTENT;
- else
- return RW_EXTENT;
-}
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
struct pnfs_block_layout {
- struct pnfs_layout_hdr bl_layout;
- struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+ struct pnfs_layout_hdr bl_layout;
+ struct rb_root bl_ext_rw;
+ struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
- struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
- struct list_head bl_commit; /* Needs layout commit */
- struct list_head bl_committing; /* Layout committing */
- unsigned int bl_count; /* entries in bl_commit */
- sector_t bl_blocksize; /* Server blocksize in sectors */
};
-#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
-
static inline struct pnfs_block_layout *
BLK_LO2EXT(struct pnfs_layout_hdr *lo)
{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
-/* blocklayoutdev.c */
-ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-void nfs4_blkdev_put(struct block_device *bdev);
-struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev);
-int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
-
-/* blocklayoutdm.c */
-void bl_free_block_dev(struct pnfs_block_dev *bdev);
-
-/* extents.c */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
- struct pnfs_block_extent **cow_read);
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length);
-void bl_put_extent(struct pnfs_block_extent *be);
-struct pnfs_block_extent *bl_alloc_extent(void);
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
-int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg);
-void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- const struct nfs4_layoutcommit_args *arg,
- int status);
-int bl_add_merge_extent(struct pnfs_block_layout *bl,
- struct pnfs_block_extent *new);
-int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length,
- struct pnfs_block_short_extent *new);
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+ sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+ struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdev.c
- *
- * Device operations for the pnfs nfs4 file layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@citi.umich.edu>
- * Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-#include <linux/module.h>
-#include <linux/buffer_head.h> /* __bread */
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-static int decode_sector_number(__be32 **rp, sector_t *sp)
-{
- uint64_t s;
-
- *rp = xdr_decode_hyper(*rp, &s);
- if (s & 0x1ff) {
- printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
- return -1;
- }
- *sp = s >> SECTOR_SHIFT;
- return 0;
-}
-
-/*
- * Release the block device
- */
-void nfs4_blkdev_put(struct block_device *bdev)
-{
- dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
- MINOR(bdev->bd_dev));
- blkdev_put(bdev, FMODE_READ);
-}
-
-ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
- size_t mlen)
-{
- struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
- nfs_net_id);
-
- if (mlen != sizeof (struct bl_dev_msg))
- return -EINVAL;
-
- if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
- return -EFAULT;
-
- wake_up(&nn->bl_wq);
-
- return mlen;
-}
-
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-{
- struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
-
- if (msg->errno >= 0)
- return;
- wake_up(bl_pipe_msg->bl_wq);
-}
-
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct pnfs_block_dev *
-nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev)
-{
- struct pnfs_block_dev *rv;
- struct block_device *bd = NULL;
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_MOUNT,
- .totallen = dev->mincount,
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- int offset, len, i, rc;
- struct net *net = server->nfs_client->cl_net;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct bl_dev_msg *reply = &nn->bl_mount_reply;
-
- dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
- dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
- dev->mincount);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
- if (!msg->data) {
- rv = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- len = dev->mincount;
- offset = sizeof(bl_msg);
- for (i = 0; len > 0; i++) {
- memcpy(&dataptr[offset], page_address(dev->pages[i]),
- len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- offset += PAGE_CACHE_SIZE;
- }
- msg->len = sizeof(bl_msg) + dev->mincount;
-
- dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
- add_wait_queue(&nn->bl_wq, &wq);
- rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
- if (rc < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- rv = ERR_PTR(rc);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
- if (reply->status != BL_DEVICE_REQUEST_PROC) {
- dprintk("%s failed to open device: %d\n",
- __func__, reply->status);
- rv = ERR_PTR(-EINVAL);
- goto out;
- }
-
- bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
- FMODE_READ, NULL);
- if (IS_ERR(bd)) {
- dprintk("%s failed to open device : %ld\n", __func__,
- PTR_ERR(bd));
- rv = ERR_CAST(bd);
- goto out;
- }
-
- rv = kzalloc(sizeof(*rv), GFP_NOFS);
- if (!rv) {
- rv = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- rv->bm_mdev = bd;
- memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
- rv->net = net;
- dprintk("%s Created device %s with bd_block_size %u\n",
- __func__,
- bd->bd_disk->disk_name,
- bd->bd_block_size);
-
-out:
- kfree(msg->data);
- return rv;
-}
-
-/* Map deviceid returned by the server to constructed block_device */
-static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
- struct nfs4_deviceid *id)
-{
- struct block_device *rv = NULL;
- struct block_mount_id *mid;
- struct pnfs_block_dev *dev;
-
- dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
- mid = BLK_ID(lo);
- spin_lock(&mid->bm_lock);
- list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
- if (memcmp(id->data, dev->bm_mdevid.data,
- NFS4_DEVICEID4_SIZE) == 0) {
- rv = dev->bm_mdev;
- goto out;
- }
- }
- out:
- spin_unlock(&mid->bm_lock);
- dprintk("%s returning %p\n", __func__, rv);
- return rv;
-}
-
-/* Tracks info needed to ensure extents in layout obey constraints of spec */
-struct layout_verification {
- u32 mode; /* R or RW */
- u64 start; /* Expected start of next non-COW extent */
- u64 inval; /* Start of INVAL coverage */
- u64 cowread; /* End of COW read coverage */
-};
-
-/* Verify the extent meets the layout requirements of the pnfs-block draft,
- * section 2.3.1.
- */
-static int verify_extent(struct pnfs_block_extent *be,
- struct layout_verification *lv)
-{
- if (lv->mode == IOMODE_READ) {
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA)
- return -EIO;
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- }
- /* lv->mode == IOMODE_RW */
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- if (lv->cowread > lv->start)
- return -EIO;
- lv->start += be->be_length;
- lv->inval = lv->start;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
- if (be->be_f_offset > lv->start)
- return -EIO;
- if (be->be_f_offset < lv->inval)
- return -EIO;
- if (be->be_f_offset < lv->cowread)
- return -EIO;
- /* It looks like you might want to min this with lv->start,
- * but you really don't.
- */
- lv->inval = lv->inval + be->be_length;
- lv->cowread = be->be_f_offset + be->be_length;
- return 0;
- } else
- return -EIO;
-}
-
-/* XDR decode pnfs_block_layout4 structure */
-int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
-{
- struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
- int i, status = -EIO;
- uint32_t count;
- struct pnfs_block_extent *be = NULL, *save;
- struct xdr_stream stream;
- struct xdr_buf buf;
- struct page *scratch;
- __be32 *p;
- struct layout_verification lv = {
- .mode = lgr->range.iomode,
- .start = lgr->range.offset >> SECTOR_SHIFT,
- .inval = lgr->range.offset >> SECTOR_SHIFT,
- .cowread = lgr->range.offset >> SECTOR_SHIFT,
- };
- LIST_HEAD(extents);
-
- dprintk("---> %s\n", __func__);
-
- scratch = alloc_page(gfp_flags);
- if (!scratch)
- return -ENOMEM;
-
- xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
- p = xdr_inline_decode(&stream, 4);
- if (unlikely(!p))
- goto out_err;
-
- count = be32_to_cpup(p++);
-
- dprintk("%s enter, number of extents %i\n", __func__, count);
- p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
- if (unlikely(!p))
- goto out_err;
-
- /* Decode individual extents, putting them in temporary
- * staging area until whole layout is decoded to make error
- * recovery easier.
- */
- for (i = 0; i < count; i++) {
- be = bl_alloc_extent();
- if (!be) {
- status = -ENOMEM;
- goto out_err;
- }
- memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
- p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
- be->be_mdev = translate_devid(lo, &be->be_devid);
- if (!be->be_mdev)
- goto out_err;
-
- /* The next three values are read in as bytes,
- * but stored as 512-byte sector lengths
- */
- if (decode_sector_number(&p, &be->be_f_offset) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_length) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_v_offset) < 0)
- goto out_err;
- be->be_state = be32_to_cpup(p++);
- if (be->be_state == PNFS_BLOCK_INVALID_DATA)
- be->be_inval = &bl->bl_inval;
- if (verify_extent(be, &lv)) {
- dprintk("%s verify failed\n", __func__);
- goto out_err;
- }
- list_add_tail(&be->be_node, &extents);
- }
- if (lgr->range.offset + lgr->range.length !=
- lv.start << SECTOR_SHIFT) {
- dprintk("%s Final length mismatch\n", __func__);
- be = NULL;
- goto out_err;
- }
- if (lv.start < lv.cowread) {
- dprintk("%s Final uncovered COW extent\n", __func__);
- be = NULL;
- goto out_err;
- }
- /* Extents decoded properly, now try to merge them in to
- * existing layout extents.
- */
- spin_lock(&bl->bl_ext_lock);
- list_for_each_entry_safe(be, save, &extents, be_node) {
- list_del(&be->be_node);
- status = bl_add_merge_extent(bl, be);
- if (status) {
- spin_unlock(&bl->bl_ext_lock);
- /* This is a fairly catastrophic error, as the
- * entire layout extent lists are now corrupted.
- * We should have some way to distinguish this.
- */
- be = NULL;
- goto out_err;
- }
- }
- spin_unlock(&bl->bl_ext_lock);
- status = 0;
- out:
- __free_page(scratch);
- dprintk("%s returns %i\n", __func__, status);
- return status;
-
- out_err:
- bl_put_extent(be);
- while (!list_empty(&extents)) {
- be = list_first_entry(&extents, struct pnfs_block_extent,
- be_node);
- list_del(&be->be_node);
- bl_put_extent(be);
- }
- goto out;
-}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdm.c
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2007 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Fred Isaman <iisaman@umich.edu>
- * Andy Adamson <andros@citi.umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-#include <linux/sched.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-static void dev_remove(struct net *net, dev_t dev)
-{
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_dev_msg bl_umount_request;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_UMOUNT,
- .totallen = sizeof(bl_umount_request),
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- dprintk("Entering %s\n", __func__);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->len = sizeof(bl_msg) + bl_msg.totallen;
- msg->data = kzalloc(msg->len, GFP_NOFS);
- if (!msg->data)
- goto out;
-
- memset(&bl_umount_request, 0, sizeof(bl_umount_request));
- bl_umount_request.major = MAJOR(dev);
- bl_umount_request.minor = MINOR(dev);
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
- add_wait_queue(&nn->bl_wq, &wq);
- if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
- kfree(msg->data);
-}
-
-/*
- * Release meta device
- */
-static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-{
- dprintk("%s Releasing\n", __func__);
- nfs4_blkdev_put(bdev->bm_mdev);
- dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
-}
-
-void bl_free_block_dev(struct pnfs_block_dev *bdev)
-{
- if (bdev) {
- if (bdev->bm_mdev) {
- dprintk("%s Removing DM device: %d:%d\n",
- __func__,
- MAJOR(bdev->bm_mdev->bd_dev),
- MINOR(bdev->bm_mdev->bd_dev));
- nfs4_blk_metadev_release(bdev);
- }
- kfree(bdev);
- }
-}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * Release everything hanging off @dev without freeing @dev itself.
+ *
+ * A device with counted children is torn down recursively and its children
+ * array is freed.  A leaf drops the reference on its block device if one
+ * was opened.  A device with neither may still own a children array: the
+ * concat/stripe parsers allocate it before the first child is counted, so
+ * it is freed here too (kfree() accepts NULL).
+ */
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	if (dev->nr_children) {
+		int i;
+
+		for (i = 0; i < dev->nr_children; i++)
+			bl_free_device(&dev->children[i]);
+		kfree(dev->children);
+	} else if (dev->bdev) {
+		blkdev_put(dev->bdev, FMODE_READ);
+	} else {
+		/* parse failed before any child was counted */
+		kfree(dev->children);
+	}
+}
+
+/*
+ * Free the pnfs_block_dev that embeds @d: first release whatever
+ * bl_free_device() tears down, then the structure itself.
+ */
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *top;
+
+	top = container_of(d, struct pnfs_block_dev, node);
+
+	bl_free_device(top);
+	kfree(top);
+}
+
+/*
+ * Decode one volume description from @xdr into @b.
+ *
+ * The volume type is read first and selects which member of @b is filled
+ * in.  All counts and lengths come straight off the wire, so each one is
+ * checked against the size of the array it indexes before it is used.
+ *
+ * Returns 0 on success, -EIO if the stream is short, the type is unknown,
+ * or a count/length does not fit the in-memory representation.
+ */
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		if (!b->simple.nr_sigs) {
+			dprintk("no signature\n");
+			return -EIO;
+		}
+		/* a negative count converts to a huge value and is rejected too */
+		if (b->simple.nr_sigs > ARRAY_SIZE(b->simple.sigs)) {
+			dprintk("too many signatures\n");
+			return -EIO;
+		}
+
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+
+			/* the signature is copied into a fixed-size buffer below */
+			if (b->simple.sigs[i].sig_len >
+			    sizeof(b->simple.sigs[i].sig)) {
+				dprintk("signature too long\n");
+				return -EIO;
+			}
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->concat.volumes_count = be32_to_cpup(p++);
+		if (b->concat.volumes_count > ARRAY_SIZE(b->concat.volumes)) {
+			dprintk("too many concat volumes\n");
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+		if (b->stripe.volumes_count > ARRAY_SIZE(b->stripe.volumes)) {
+			dprintk("too many stripe volumes\n");
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Map callback for a leaf device: report the device's own block device,
+ * start, length and disk offset.  @offset is not consulted.  Always
+ * returns true.
+ */
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	map->bdev = dev->bdev;
+	map->disk_offset = dev->disk_offset;
+	map->start = dev->start;
+	map->len = dev->len;
+
+	return true;
+}
+
+/*
+ * Map callback for a concatenation: find the child whose
+ * [start, start + len) range contains @offset and let it fill in @map,
+ * handing it the offset relative to the child's start.
+ *
+ * Returns true once a containing child was found (the child's own return
+ * value is not inspected), false if no child covers @offset.
+ */
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	int idx;
+
+	for (idx = 0; idx < dev->nr_children; idx++) {
+		struct pnfs_block_dev *sub = &dev->children[idx];
+
+		if (sub->start <= offset && offset < sub->start + sub->len) {
+			sub->map(sub, offset - sub->start, map);
+			return true;
+		}
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+/*
+ * Map callback for a striped device.
+ *
+ * @offset is converted to a chunk number; the chunk number modulo the
+ * number of children selects the child.  The child fills in @map for the
+ * stripe's disk offset, after which start, disk_offset and len are
+ * adjusted so that @map describes the single chunk containing @offset.
+ *
+ * Returns true on success, false if the computed child index is out of
+ * range.
+ */
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	struct pnfs_block_dev *child;
+	u64 chunk;
+	u32 chunk_idx;
+	u64 disk_offset;
+
+	chunk = div_u64(offset, dev->chunk_size);
+	div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+	/* valid indices are 0 .. nr_children - 1, so equality is an error too */
+	if (chunk_idx >= dev->nr_children) {
+		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+			__func__, chunk_idx, offset, dev->chunk_size);
+		/* error, should not happen */
+		return false;
+	}
+
+	/* truncate offset to the beginning of the stripe */
+	offset = chunk * dev->chunk_size;
+
+	/* disk offset of the stripe */
+	disk_offset = div_u64(offset, dev->nr_children);
+
+	child = &dev->children[chunk_idx];
+	child->map(child, disk_offset, map);
+
+	map->start += offset;
+	map->disk_offset += disk_offset;
+	map->len = dev->chunk_size;
+	return true;
+}
+
+/*
+ * Forward declaration: the slice, concat and stripe parsers below call
+ * bl_parse_deviceid() for their component volumes, and bl_parse_deviceid()
+ * in turn dispatches back to them.
+ */
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+/*
+ * Set up @d as a leaf device backed by the block device that
+ * bl_resolve_deviceid() returns for volumes[idx].
+ *
+ * d->bdev is only assigned once the device has been opened successfully.
+ * bl_free_device() treats any non-NULL d->bdev as an open device and calls
+ * blkdev_put() on it, so an ERR_PTR must never be stored there.
+ *
+ * Returns 0 on success, -EIO if the device id cannot be resolved, or the
+ * error from blkdev_get_by_dev().
+ */
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	struct block_device *bdev;
+	dev_t dev;
+
+	dev = bl_resolve_deviceid(server, v, gfp_mask);
+	if (!dev)
+		return -EIO;
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (IS_ERR(bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+		return PTR_ERR(bdev);
+	}
+	d->bdev = bdev;
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		d->bdev->bd_disk->disk_name);
+	return 0;
+}
+
+/*
+ * Set up @d as a slice: parse the volume the slice refers to into @d
+ * itself, then override disk_offset and len with the slice's start and
+ * length.  Returns 0 or the error from parsing the underlying volume.
+ */
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *slice = &volumes[idx];
+	int err;
+
+	err = bl_parse_deviceid(server, d, volumes, slice->slice.volume,
+				gfp_mask);
+	if (err != 0)
+		return err;
+
+	d->len = slice->slice.len;
+	d->disk_offset = slice->slice.start;
+	return 0;
+}
+
+/*
+ * Set up @d as a concatenation of the volumes listed in volumes[idx].
+ *
+ * Each child is parsed in order and placed directly after the previous
+ * one: its start is advanced by the combined length of the children before
+ * it.  nr_children only counts children that parsed successfully.
+ *
+ * The children array is allocated with the caller's @gfp_mask, the same
+ * mask that is handed down to every child parse.
+ *
+ * Returns 0 on success, -ENOMEM if the children array cannot be
+ * allocated, or the error from parsing a child.
+ */
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->concat.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->concat.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->concat.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		d->children[i].start += len;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->map = bl_map_concat;
+	return 0;
+}
+
+/*
+ * Set up @d as a stripe over the volumes listed in volumes[idx].
+ *
+ * bl_map_stripe() divides by both the chunk size and the number of
+ * children, so a description with either value zero is rejected up front.
+ * The children array is allocated with the caller's @gfp_mask, the same
+ * mask that is handed down to every child parse.
+ *
+ * Returns 0 on success, -EIO for an unusable stripe description, -ENOMEM
+ * if the children array cannot be allocated, or the error from parsing a
+ * child.
+ */
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	if (!v->stripe.chunk_size || !v->stripe.volumes_count) {
+		dprintk("invalid stripe geometry\n");
+		return -EIO;
+	}
+
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+}
+
+/*
+ * Build @d from volumes[idx] by dispatching on the volume type to the
+ * matching parser.  Returns that parser's result, or -EIO for a type that
+ * has no parser.
+ *
+ * NOTE(review): @idx is used to index @volumes without a range check here;
+ * the slice/concat/stripe parsers pass indices decoded from the wire, so
+ * confirm they are bounded somewhere.
+ */
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	int ret;
+
+	switch (volumes[idx].type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		ret = bl_parse_simple(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		ret = bl_parse_slice(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		ret = bl_parse_concat(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		ret = bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+		break;
+	default:
+		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		ret = -EIO;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Decode the device description in @pdev and build the matching
+ * pnfs_block_dev tree.
+ *
+ * The description is an array of volumes; the last entry is the top-level
+ * volume and the tree is built starting from it.  A volume count of zero
+ * or less is rejected, since there would be no last entry to start from.
+ *
+ * Returns the deviceid node embedded in the new top-level device, or NULL
+ * on any allocation, decode or parse failure.
+ */
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+
+	/* volumes[nr_volumes - 1] is dereferenced below */
+	if (nr_volumes <= 0) {
+		dprintk("invalid volume count\n");
+		goto out_free_scratch;
+	}
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			  gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+	if (ret) {
+		bl_free_device(top);
+		kfree(top);
+		goto out_free_volumes;
+	}
+
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Convert an rb_node embedded in an extent back into the extent. */
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+	struct pnfs_block_extent *be;
+
+	be = rb_entry(node, struct pnfs_block_extent, be_node);
+	return be;
+}
+
+/* Return the first extent in @root, or NULL if the tree is empty. */
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *first = rb_first(root);
+
+	if (!first)
+		return NULL;
+	return ext_node(first);
+}
+
+/* Return the extent preceding @be in its tree, or NULL if there is none. */
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *prev = rb_prev(&be->be_node);
+
+	if (!prev)
+		return NULL;
+	return ext_node(prev);
+}
+
+/* Return the extent following @be in its tree, or NULL if there is none. */
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *next = rb_next(&be->be_node);
+
+	if (!next)
+		return NULL;
+	return ext_node(next);
+}
+
+/* First file sector past the end of @be (be_f_offset + be_length). */
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	sector_t end = be->be_f_offset;
+
+	end += be->be_length;
+	return end;
+}
+
+/*
+ * Find the extent that contains file sector @start.  If no extent
+ * contains it, return the extent that follows the gap @start falls into,
+ * or NULL when there is none (or the tree is empty).
+ */
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+	struct pnfs_block_extent *last = NULL;
+	struct rb_node *cur;
+
+	for (cur = root->rb_node; cur; ) {
+		last = ext_node(cur);
+
+		if (start < last->be_f_offset) {
+			cur = cur->rb_left;
+			continue;
+		}
+		if (start >= ext_f_end(last)) {
+			cur = cur->rb_right;
+			continue;
+		}
+		return last;
+	}
+
+	/* empty tree */
+	if (!last)
+		return NULL;
+
+	/* the last node visited borders the gap on one side or the other */
+	if (start < last->be_f_offset)
+		return last;
+	if (start >= ext_f_end(last))
+		return ext_tree_next(last);
+
+	return NULL;
+}
+
+/*
+ * Decide whether @be2 can be folded into @be1 as its direct successor.
+ *
+ * Both must have the same state and device, and @be2 must begin at the
+ * file sector where @be1 ends.  Unless the state is PNFS_BLOCK_NONE_DATA
+ * the volume offsets must be contiguous as well, and for
+ * PNFS_BLOCK_INVALID_DATA the tags must match.
+ */
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	if (be1->be_state != be2->be_state ||
+	    be1->be_device != be2->be_device)
+		return false;
+
+	if (ext_f_end(be1) != be2->be_f_offset)
+		return false;
+
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA) {
+		if (be1->be_v_offset + be1->be_length != be2->be_v_offset)
+			return false;
+	}
+
+	if (be1->be_state == PNFS_BLOCK_INVALID_DATA)
+		return be1->be_tag == be2->be_tag;
+
+	return true;
+}
+
+/*
+ * Fold @be into its predecessor when the two can be merged.  On a merge
+ * @be is removed from @root, its device reference is dropped, it is freed,
+ * and the predecessor is returned; otherwise @be is returned unchanged.
+ */
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *prev = ext_tree_prev(be);
+
+	if (!prev || !ext_can_merge(prev, be))
+		return be;
+
+	prev->be_length += be->be_length;
+	rb_erase(&be->be_node, root);
+	nfs4_put_deviceid_node(be->be_device);
+	kfree(be);
+
+	return prev;
+}
+
+/*
+ * Absorb the successor of @be into @be when the two can be merged.  The
+ * successor is removed from @root, its device reference is dropped and it
+ * is freed.  Always returns @be.
+ */
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *next = ext_tree_next(be);
+
+	if (!next || !ext_can_merge(be, next))
+		return be;
+
+	be->be_length += next->be_length;
+	rb_erase(&next->be_node, root);
+	nfs4_put_deviceid_node(next->be_device);
+	kfree(next);
+
+	return be;
+}
+
+/*
+ * Insert @new into @root, keyed by file offset.
+ *
+ * While descending, if @merge_ok is set and @new can be merged with the
+ * node being visited, that node absorbs @new; @new's device reference is
+ * then dropped and @new is freed.  Otherwise @new is linked in as a leaf.
+ *
+ * @new must not start inside an existing extent: that case hits BUG().
+ */
+static void
+__ext_tree_insert(struct rb_root *root,
+ struct pnfs_block_extent *new, bool merge_ok)
+{
+ struct rb_node **p = &root->rb_node, *parent = NULL;
+ struct pnfs_block_extent *be;
+
+ while (*p) {
+ parent = *p;
+ be = ext_node(parent);
+
+ if (new->be_f_offset < be->be_f_offset) {
+ /* new directly precedes be: grow be backwards over new */
+ if (merge_ok && ext_can_merge(new, be)) {
+ be->be_f_offset = new->be_f_offset;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset = new->be_v_offset;
+ be->be_length += new->be_length;
+ /* the grown extent may now touch its predecessor */
+ be = ext_try_to_merge_left(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_left;
+ } else if (new->be_f_offset >= ext_f_end(be)) {
+ /* new directly follows be: grow be forwards over new */
+ if (merge_ok && ext_can_merge(be, new)) {
+ be->be_length += new->be_length;
+ /* the grown extent may now touch its successor */
+ be = ext_try_to_merge_right(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ /* new starts inside an existing extent */
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->be_node, parent, p);
+ rb_insert_color(&new->be_node, root);
+ return;
+free_new:
+ /* new was merged into an existing node and is no longer needed */
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
+}
+
+/*
+ * Remove the file sector range [start, end) from @root.
+ *
+ * Extents that lie entirely inside the range are erased and freed.  An
+ * extent that only partly overlaps is trimmed, and an extent that covers
+ * the whole range with data left on both sides is split in two.
+ *
+ * Returns 0 on success (including when nothing overlaps), or -ENOMEM if
+ * the extent needed for a split cannot be allocated.  The allocation uses
+ * GFP_ATOMIC; the callers in this file hold bl_ext_lock.
+ */
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+ struct pnfs_block_extent *be;
+ sector_t len1 = 0, len2 = 0;
+ sector_t orig_v_offset;
+ sector_t orig_len;
+
+ /* first extent containing or following start */
+ be = __ext_tree_search(root, start);
+ if (!be)
+ return 0;
+ if (be->be_f_offset >= end)
+ return 0;
+
+ orig_v_offset = be->be_v_offset;
+ orig_len = be->be_length;
+
+ /* len1: sectors of be kept before start; len2: sectors kept after end */
+ if (start > be->be_f_offset)
+ len1 = start - be->be_f_offset;
+ if (ext_f_end(be) > end)
+ len2 = ext_f_end(be) - end;
+
+ if (len2 > 0) {
+ /* the range ends inside be, so be is the only extent affected */
+ if (len1 > 0) {
+ /* hole in the middle: be keeps the head, new gets the tail */
+ struct pnfs_block_extent *new;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = len1;
+
+ new->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ new->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ new->be_length = len2;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, true);
+ } else {
+ /* drop the head of be, keep the last len2 sectors */
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ be->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ be->be_length = len2;
+ }
+ } else {
+ /* the range reaches the end of be and may continue past it */
+ if (len1 > 0) {
+ /* keep the head of the first extent, move on to the next */
+ be->be_length = len1;
+ be = ext_tree_next(be);
+ }
+
+ /* erase every extent that ends at or before end */
+ while (be && ext_f_end(be) <= end) {
+ struct pnfs_block_extent *next = ext_tree_next(be);
+
+ rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ be = next;
+ }
+
+ /* last extent overlaps only partly: drop its head up to end */
+ if (be && be->be_f_offset < end) {
+ len1 = ext_f_end(be) - end;
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset += be->be_length - len1;
+ be->be_length = len1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Add @new to the layout's extent tree.
+ *
+ * READWRITE and INVALID extents go into bl_ext_rw, READ and NONE extents
+ * into bl_ext_ro.  Existing extents win over @new: any part of @new that
+ * overlaps an extent already in the tree is dropped, and only the
+ * uncovered parts are inserted (splitting @new when it spans an existing
+ * extent).
+ *
+ * On success ownership of @new passes to the tree, or @new is freed if it
+ * was fully covered.  On error @new is not freed here.
+ *
+ * Returns 0 on success, -EINVAL for an unknown extent state, or -ENOMEM
+ * if the extent needed for a split cannot be allocated.
+ */
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		/* nothing overlaps new */
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		/* new starts inside be */
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			/* fully covered: nothing left to insert */
+			nfs4_put_deviceid_node(new->be_device);
+			kfree(new);
+		} else {
+			/* drop the covered head, retry with the remainder */
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		/* new starts before be and ends inside it: keep only the head */
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		/* new spans be: insert the head now, retry with the tail */
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		/* GFP_ATOMIC: bl_ext_lock is held */
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		split->be_device = nfs4_get_deviceid(new->be_device);
+		__ext_tree_insert(root, split, true);
+
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+/*
+ * Look for the extent in @root that contains file sector @isect.  When one
+ * is found a copy of it is stored in @ret and true is returned; otherwise
+ * @ret is left untouched and false is returned.
+ */
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct pnfs_block_extent *ext = ext_node(cur);
+
+		if (isect < ext->be_f_offset) {
+			cur = cur->rb_left;
+			continue;
+		}
+		if (isect >= ext_f_end(ext)) {
+			cur = cur->rb_right;
+			continue;
+		}
+
+		*ret = *ext;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Look up the extent containing file sector @isect under bl_ext_lock and
+ * copy it into @ret.
+ *
+ * When @rw is false the read-only tree is consulted first and the
+ * read-write tree only if that misses; when @rw is true only the
+ * read-write tree is searched.  Returns true if an extent was found.
+ */
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent *ret, bool rw)
+{
+	bool hit;
+
+	spin_lock(&bl->bl_ext_lock);
+	hit = !rw && __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+	if (!hit)
+		hit = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return hit;
+}
+
+/*
+ * Remove the file sector range [start, end) from the layout's extent
+ * trees under bl_ext_lock.  The read-only tree is always processed; the
+ * read-write tree only when @rw is set.
+ *
+ * Returns 0 on success; if both removals fail, the error from the
+ * read-only tree is the one reported.
+ */
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+		sector_t start, sector_t end)
+{
+	int err;
+
+	spin_lock(&bl->bl_ext_lock);
+
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (rw) {
+		int err_rw = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+
+		if (err == 0)
+			err = err_rw;
+	}
+
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+/*
+ * Split @be at file sector @split.  @be keeps [be_f_offset, split) and a
+ * newly allocated extent with the same state, tag and device takes over
+ * [split, old end).  The new extent is inserted with merging disabled.
+ *
+ * Returns 0 on success or -ENOMEM if the new extent cannot be allocated
+ * (in which case @be is unchanged).
+ */
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+		sector_t split)
+{
+	sector_t total_len = be->be_length;
+	sector_t head_len = split - be->be_f_offset;
+	struct pnfs_block_extent *tail;
+
+	tail = kzalloc(sizeof(*tail), GFP_ATOMIC);
+	if (!tail)
+		return -ENOMEM;
+
+	be->be_length = head_len;
+
+	tail->be_f_offset = split;
+	tail->be_length = total_len - head_len;
+	tail->be_state = be->be_state;
+	tail->be_tag = be->be_tag;
+	if (be->be_state != PNFS_BLOCK_NONE_DATA)
+		tail->be_v_offset = be->be_v_offset + head_len;
+	tail->be_device = nfs4_get_deviceid(be->be_device);
+
+	__ext_tree_insert(root, tail, false);
+	return 0;
+}
+
+/*
+ * Record that the sector range [@start, @start + @len) has been written.
+ *
+ * Under bl_ext_lock this first removes the range from the read-only tree,
+ * then walks the read-write tree from the position __ext_tree_search()
+ * returns for @start and tags every untagged PNFS_BLOCK_INVALID_DATA extent
+ * inside the range with EXTENT_WRITTEN.  Extents that straddle @start or
+ * @end are first trimmed to the range, either by shifting the boundary into
+ * a mergeable neighbour or by splitting.
+ *
+ * Returns 0 on success or the error from __ext_tree_remove() or
+ * ext_tree_split(); on error, extents processed earlier in the walk stay
+ * tagged.
+ */
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len)
+{
+ struct rb_root *root = &bl->bl_ext_rw;
+ sector_t end = start + len;
+ struct pnfs_block_extent *be;
+ int err = 0;
+
+ spin_lock(&bl->bl_ext_lock);
+ /*
+ * First remove all COW extents or holes from written to range.
+ */
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ if (err)
+ goto out;
+
+ /*
+ * Then mark all invalid extents in the range as written to.
+ */
+ for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+ if (be->be_f_offset >= end)
+ break;
+
+ /* only untagged INVALID_DATA extents change state here */
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+ continue;
+
+ if (be->be_f_offset < start) {
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ /* hand the part in front of start to the left neighbour */
+ sector_t diff = start - be->be_f_offset;
+
+ left->be_length += diff;
+
+ be->be_f_offset += diff;
+ be->be_v_offset += diff;
+ be->be_length -= diff;
+ } else {
+ /*
+ * be keeps the part in front of start; the new
+ * extent beginning at start is reached on the
+ * next iteration.
+ */
+ err = ext_tree_split(root, be, start);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (ext_f_end(be) > end) {
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ /*
+ * NOTE(review): diff is the length of be that lies in
+ * front of end, not the length of the tail past end
+ * (ext_f_end(be) - end); the new boundary therefore
+ * only lands on end when both happen to be equal.
+ * Confirm whether this is intended.
+ */
+ sector_t diff = end - be->be_f_offset;
+
+ be->be_length -= diff;
+
+ right->be_f_offset -= diff;
+ right->be_v_offset -= diff;
+ right->be_length += diff;
+ } else {
+ err = ext_tree_split(root, be, end);
+ if (err)
+ goto out;
+ }
+ }
+
+ /* be now lies inside the range (unless a front split left it outside) */
+ if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+ be->be_tag = EXTENT_WRITTEN;
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ }
+out:
+ spin_unlock(&bl->bl_ext_lock);
+ return err;
+}
+
+/*
+ * Drop the page references held for a layoutupdate buffer.
+ *
+ * When arg->layoutupdate_pages points at the embedded single page pointer,
+ * only that page is put.  Otherwise one page per PAGE_SIZE of @buffer_size
+ * (rounded up) is put and the page pointer array itself is freed.
+ *
+ * NOTE(review): every slot of a separately allocated array is passed to
+ * put_page(), so the array must be fully populated when this is called -
+ * confirm for the retry path in ext_tree_prepare_commit().
+ */
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+ size_t buffer_size)
+{
+ if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+ int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+ for (i = 0; i < nr_pages; i++)
+ put_page(arg->layoutupdate_pages[i]);
+ kfree(arg->layoutupdate_pages);
+ } else {
+ put_page(arg->layoutupdate_page);
+ }
+}
+
+/*
+ * XDR-encode all written extents of @bl into the buffer at @p.
+ *
+ * Walks the read-write tree under bl_ext_lock and, for every
+ * PNFS_BLOCK_INVALID_DATA extent tagged EXTENT_WRITTEN, emits the device
+ * id, the file offset and length converted from sectors to bytes, a zero
+ * storage offset and the PNFS_BLOCK_READWRITE_DATA state, then re-tags the
+ * extent as EXTENT_COMMITTING.
+ *
+ * @count is incremented for every matching extent, including those that no
+ * longer fit into @buffer_size bytes, so that the caller learns how much
+ * space is needed.  Returns 0 if everything fit, -ENOSPC otherwise.
+ *
+ * NOTE(review): extents encoded before the buffer ran out keep their
+ * EXTENT_COMMITTING tag, so a second call with a larger buffer skips them
+ * because of the EXTENT_WRITTEN filter - confirm this is handled by the
+ * caller.
+ */
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+ size_t buffer_size, size_t *count)
+{
+ struct pnfs_block_extent *be;
+ int ret = 0;
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_WRITTEN)
+ continue;
+
+ (*count)++;
+ if (*count * BL_EXTENT_SIZE > buffer_size) {
+ /* keep counting.. */
+ ret = -ENOSPC;
+ continue;
+ }
+
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+ NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+ be->be_tag = EXTENT_COMMITTING;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+
+ return ret;
+}
+
+/*
+ * Build the layoutupdate payload for a LAYOUTCOMMIT.
+ *
+ * A single page is tried first.  If ext_tree_encode_commit() reports that
+ * the written extents do not fit, the current buffer is released, a
+ * vmalloc'ed buffer sized from the returned extent count is allocated
+ * together with a page pointer array, and the encoding is retried.  On
+ * success the extent count is stored in the first word of the buffer,
+ * arg->layoutupdate_len is set to the encoded size in bytes and, for the
+ * vmalloc case, arg->layoutupdate_pages is filled with the pages backing
+ * the buffer.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ size_t count = 0, buffer_size = PAGE_SIZE;
+ __be32 *start_p;
+ int ret;
+
+ dprintk("%s enter\n", __func__);
+
+ arg->layoutupdate_page = alloc_page(GFP_NOFS);
+ if (!arg->layoutupdate_page)
+ return -ENOMEM;
+ start_p = page_address(arg->layoutupdate_page);
+ arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+ /* the first word is reserved for the extent count written below */
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+ if (unlikely(ret)) {
+ ext_tree_free_commitdata(arg, buffer_size);
+
+ /* count now holds the number of extents that need encoding */
+ buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ count = 0;
+
+ arg->layoutupdate_pages =
+ kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+ sizeof(struct page *), GFP_NOFS);
+ if (!arg->layoutupdate_pages)
+ return -ENOMEM;
+
+ start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+ if (!start_p) {
+ kfree(arg->layoutupdate_pages);
+ return -ENOMEM;
+ }
+
+ goto retry;
+ }
+
+ *start_p = cpu_to_be32(count);
+ arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+ if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+ /*
+ * Walk the buffer with a byte pointer: layoutupdate_len and
+ * PAGE_SIZE are byte quantities, so doing the arithmetic on the
+ * __be32 pointer would scale both the stride and the end bound
+ * by sizeof(__be32) and look up pages outside the buffer.
+ */
+ char *p = (char *)start_p;
+ char *end = p + arg->layoutupdate_len;
+ int i = 0;
+
+ for (; p < end; p += PAGE_SIZE)
+ arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+ }
+
+ dprintk("%s found %zu ranges\n", __func__, count);
+ return 0;
+}
+
+/*
+ * Finish a LAYOUTCOMMIT: release the layoutupdate buffer pages and settle
+ * the state of every extent that was sent.
+ *
+ * Under bl_ext_lock, each PNFS_BLOCK_INVALID_DATA extent in the read-write
+ * tree tagged EXTENT_COMMITTING is either turned into an untagged
+ * PNFS_BLOCK_READWRITE_DATA extent (@status == 0) or re-tagged
+ * EXTENT_WRITTEN so that it is picked up by a later commit (@status != 0).
+ * After each change a merge with the neighbouring extents is attempted.
+ */
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ struct rb_root *root = &bl->bl_ext_rw;
+ struct pnfs_block_extent *be;
+
+ dprintk("%s status %d\n", __func__, status);
+
+ ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_COMMITTING)
+ continue;
+
+ if (status) {
+ /*
+ * Mark as written and try again.
+ *
+ * XXX: some real error handling here wouldn't hurt..
+ */
+ be->be_tag = EXTENT_WRITTEN;
+ } else {
+ be->be_state = PNFS_BLOCK_READWRITE_DATA;
+ be->be_tag = 0;
+ }
+
+ /* continue the walk from whatever extent survives the merges */
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayout.h
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@citi.umich.edu>
- * Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include "blocklayout.h"
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-/* Bit numbers */
-#define EXTENT_INITIALIZED 0
-#define EXTENT_WRITTEN 1
-#define EXTENT_IN_COMMIT 2
-#define INTERNAL_EXISTS MY_MAX_TAGS
-#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
-
-/* Returns largest t<=s s.t. t%base==0 */
-static inline sector_t normalize(sector_t s, int base)
-{
- sector_t tmp = s; /* Since do_div modifies its argument */
- return s - sector_div(tmp, base);
-}
-
-static inline sector_t normalize_up(sector_t s, int base)
-{
- return normalize(s + base - 1, base);
-}
-
-/* Complete stub using list while determine API wanted */
-
-/* Returns tags, or negative */
-static int32_t _find_entry(struct my_tree *tree, u64 s)
-{
- struct pnfs_inval_tracking *pos;
-
- dprintk("%s(%llu) enter\n", __func__, s);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector > s)
- continue;
- else if (pos->it_sector == s)
- return pos->it_tags & INTERNAL_MASK;
- else
- break;
- }
- return -ENOENT;
-}
-
-static inline
-int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
-{
- int32_t tags;
-
- dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
- s = normalize(s, tree->mtt_step_size);
- tags = _find_entry(tree, s);
- if ((tags < 0) || !(tags & (1 << tag)))
- return 0;
- else
- return 1;
-}
-
-/* Creates entry with tag, or if entry already exists, unions tag to it.
- * If storage is not NULL, newly created entry will use it.
- * Returns number of entries added, or negative on error.
- */
-static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
- struct pnfs_inval_tracking *storage)
-{
- int found = 0;
- struct pnfs_inval_tracking *pos;
-
- dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector > s)
- continue;
- else if (pos->it_sector == s) {
- found = 1;
- break;
- } else
- break;
- }
- if (found) {
- pos->it_tags |= (1 << tag);
- return 0;
- } else {
- struct pnfs_inval_tracking *new;
- new = storage;
- new->it_sector = s;
- new->it_tags = (1 << tag);
- list_add(&new->it_link, &pos->it_link);
- return 1;
- }
-}
-
-/* XXXX Really want option to not create */
-/* Over range, unions tag with existing entries, else creates entry with tag */
-static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
-{
- u64 i;
-
- dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
- for (i = normalize(s, tree->mtt_step_size); i < s + length;
- i += tree->mtt_step_size)
- if (_add_entry(tree, i, tag, NULL))
- return -ENOMEM;
- return 0;
-}
-
-/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct pnfs_inval_markings *marks,
- u64 offset, u64 length)
-{
- u64 start, end, s;
- int count, i, used = 0, status = -ENOMEM;
- struct pnfs_inval_tracking **storage;
- struct my_tree *tree = &marks->im_tree;
-
- dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
- start = normalize(offset, tree->mtt_step_size);
- end = normalize_up(offset + length, tree->mtt_step_size);
- count = (int)(end - start) / (int)tree->mtt_step_size;
-
- /* Pre-malloc what memory we might need */
- storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
- if (!storage)
- return -ENOMEM;
- for (i = 0; i < count; i++) {
- storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
- GFP_NOFS);
- if (!storage[i])
- goto out_cleanup;
- }
-
- spin_lock_bh(&marks->im_lock);
- for (s = start; s < end; s += tree->mtt_step_size)
- used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
- spin_unlock_bh(&marks->im_lock);
-
- status = 0;
-
- out_cleanup:
- for (i = used; i < count; i++) {
- if (!storage[i])
- break;
- kfree(storage[i]);
- }
- kfree(storage);
- return status;
-}
-
-/* We are relying on page lock to serialize this */
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
-{
- int rv;
-
- spin_lock_bh(&marks->im_lock);
- rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
- spin_unlock_bh(&marks->im_lock);
- return rv;
-}
-
-/* Assume start, end already sector aligned */
-static int
-_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
-{
- struct pnfs_inval_tracking *pos;
- u64 expect = 0;
-
- dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector >= end)
- continue;
- if (!expect) {
- if ((pos->it_sector == end - tree->mtt_step_size) &&
- (pos->it_tags & (1 << tag))) {
- expect = pos->it_sector - tree->mtt_step_size;
- if (pos->it_sector < tree->mtt_step_size || expect < start)
- return 1;
- continue;
- } else {
- return 0;
- }
- }
- if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
- return 0;
- expect -= tree->mtt_step_size;
- if (expect < start)
- return 1;
- }
- return 0;
-}
-
-static int is_range_written(struct pnfs_inval_markings *marks,
- sector_t start, sector_t end)
-{
- int rv;
-
- spin_lock_bh(&marks->im_lock);
- rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
- spin_unlock_bh(&marks->im_lock);
- return rv;
-}
-
-/* Marks sectors in [offest, offset_length) as having been initialized.
- * All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Currently assumes offset is page-aligned
- */
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length)
-{
- sector_t start, end;
-
- dprintk("%s(offset=%llu,len=%llu) enter\n",
- __func__, (u64)offset, (u64)length);
-
- start = normalize(offset, marks->im_block_size);
- end = normalize_up(offset + length, marks->im_block_size);
- if (_preload_range(marks, start, end - start))
- goto outerr;
-
- spin_lock_bh(&marks->im_lock);
- if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
- goto out_unlock;
- spin_unlock_bh(&marks->im_lock);
-
- return 0;
-
-out_unlock:
- spin_unlock_bh(&marks->im_lock);
-outerr:
- return -ENOMEM;
-}
-
-/* Marks sectors in [offest, offset+length) as having been written to disk.
- * All lengths should be block aligned.
- */
-static int mark_written_sectors(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length)
-{
- int status;
-
- dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
- (u64)offset, (u64)length);
- spin_lock_bh(&marks->im_lock);
- status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
- spin_unlock_bh(&marks->im_lock);
- return status;
-}
-
-static void print_short_extent(struct pnfs_block_short_extent *be)
-{
- dprintk("PRINT SHORT EXTENT extent %p\n", be);
- if (be) {
- dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
- dprintk(" be_length %llu\n", (u64)be->bse_length);
- }
-}
-
-static void print_clist(struct list_head *list, unsigned int count)
-{
- struct pnfs_block_short_extent *be;
- unsigned int i = 0;
-
- ifdebug(FACILITY) {
- printk(KERN_DEBUG "****************\n");
- printk(KERN_DEBUG "Extent list looks like:\n");
- list_for_each_entry(be, list, bse_node) {
- i++;
- print_short_extent(be);
- }
- if (i != count)
- printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
- printk(KERN_DEBUG "****************\n");
- }
-}
-
-/* Note: In theory, we should do more checking that devid's match between
- * old and new, but if they don't, the lists are too corrupt to salvage anyway.
- */
-/* Note this is very similar to bl_add_merge_extent */
-static void add_to_commitlist(struct pnfs_block_layout *bl,
- struct pnfs_block_short_extent *new)
-{
- struct list_head *clist = &bl->bl_commit;
- struct pnfs_block_short_extent *old, *save;
- sector_t end = new->bse_f_offset + new->bse_length;
-
- dprintk("%s enter\n", __func__);
- print_short_extent(new);
- print_clist(clist, bl->bl_count);
- bl->bl_count++;
- /* Scan for proper place to insert, extending new to the left
- * as much as possible.
- */
- list_for_each_entry_safe(old, save, clist, bse_node) {
- if (new->bse_f_offset < old->bse_f_offset)
- break;
- if (end <= old->bse_f_offset + old->bse_length) {
- /* Range is already in list */
- bl->bl_count--;
- kfree(new);
- return;
- } else if (new->bse_f_offset <=
- old->bse_f_offset + old->bse_length) {
- /* new overlaps or abuts existing be */
- if (new->bse_mdev == old->bse_mdev) {
- /* extend new to fully replace old */
- new->bse_length += new->bse_f_offset -
- old->bse_f_offset;
- new->bse_f_offset = old->bse_f_offset;
- list_del(&old->bse_node);
- bl->bl_count--;
- kfree(old);
- }
- }
- }
- /* Note that if we never hit the above break, old will not point to a
- * valid extent. However, in that case &old->bse_node==list.
- */
- list_add_tail(&new->bse_node, &old->bse_node);
- /* Scan forward for overlaps. If we find any, extend new and
- * remove the overlapped extent.
- */
- old = list_prepare_entry(new, clist, bse_node);
- list_for_each_entry_safe_continue(old, save, clist, bse_node) {
- if (end < old->bse_f_offset)
- break;
- /* new overlaps or abuts old */
- if (new->bse_mdev == old->bse_mdev) {
- if (end < old->bse_f_offset + old->bse_length) {
- /* extend new to fully cover old */
- end = old->bse_f_offset + old->bse_length;
- new->bse_length = end - new->bse_f_offset;
- }
- list_del(&old->bse_node);
- bl->bl_count--;
- kfree(old);
- }
- }
- dprintk("%s: after merging\n", __func__);
- print_clist(clist, bl->bl_count);
-}
-
-/* Note the range described by offset, length is guaranteed to be contained
- * within be.
- * new will be freed, either by this function or add_to_commitlist if they
- * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
-int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length,
- struct pnfs_block_short_extent *new)
-{
- sector_t new_end, end = offset + length;
- struct pnfs_block_layout *bl = container_of(be->be_inval,
- struct pnfs_block_layout,
- bl_inval);
-
- mark_written_sectors(be->be_inval, offset, length);
- /* We want to add the range to commit list, but it must be
- * block-normalized, and verified that the normalized range has
- * been entirely written to disk.
- */
- new->bse_f_offset = offset;
- offset = normalize(offset, bl->bl_blocksize);
- if (offset < new->bse_f_offset) {
- if (is_range_written(be->be_inval, offset, new->bse_f_offset))
- new->bse_f_offset = offset;
- else
- new->bse_f_offset = offset + bl->bl_blocksize;
- }
- new_end = normalize_up(end, bl->bl_blocksize);
- if (end < new_end) {
- if (is_range_written(be->be_inval, end, new_end))
- end = new_end;
- else
- end = new_end - bl->bl_blocksize;
- }
- if (end <= new->bse_f_offset) {
- kfree(new);
- return 0;
- }
- new->bse_length = end - new->bse_f_offset;
- new->bse_devid = be->be_devid;
- new->bse_mdev = be->be_mdev;
-
- spin_lock(&bl->bl_ext_lock);
- add_to_commitlist(bl, new);
- spin_unlock(&bl->bl_ext_lock);
- return 0;
-}
-
-static void print_bl_extent(struct pnfs_block_extent *be)
-{
- dprintk("PRINT EXTENT extent %p\n", be);
- if (be) {
- dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
- dprintk(" be_length %llu\n", (u64)be->be_length);
- dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
- dprintk(" be_state %d\n", be->be_state);
- }
-}
-
-static void
-destroy_extent(struct kref *kref)
-{
- struct pnfs_block_extent *be;
-
- be = container_of(kref, struct pnfs_block_extent, be_refcnt);
- dprintk("%s be=%p\n", __func__, be);
- kfree(be);
-}
-
-void
-bl_put_extent(struct pnfs_block_extent *be)
-{
- if (be) {
- dprintk("%s enter %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_put(&be->be_refcnt, destroy_extent);
- }
-}
-
-struct pnfs_block_extent *bl_alloc_extent(void)
-{
- struct pnfs_block_extent *be;
-
- be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
- if (!be)
- return NULL;
- INIT_LIST_HEAD(&be->be_node);
- kref_init(&be->be_refcnt);
- be->be_inval = NULL;
- return be;
-}
-
-static void print_elist(struct list_head *list)
-{
- struct pnfs_block_extent *be;
- dprintk("****************\n");
- dprintk("Extent list looks like:\n");
- list_for_each_entry(be, list, be_node) {
- print_bl_extent(be);
- }
- dprintk("****************\n");
-}
-
-static inline int
-extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
-{
- /* Note this assumes new->be_f_offset >= old->be_f_offset */
- return (new->be_state == old->be_state) &&
- ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
- ((new->be_v_offset - old->be_v_offset ==
- new->be_f_offset - old->be_f_offset) &&
- new->be_mdev == old->be_mdev));
-}
-
-/* Adds new to appropriate list in bl, modifying new and removing existing
- * extents as appropriate to deal with overlaps.
- *
- * See bl_find_get_extent for list constraints.
- *
- * Refcount on new is already set. If end up not using it, or error out,
- * need to put the reference.
- *
- * bl->bl_ext_lock is held by caller.
- */
-int
-bl_add_merge_extent(struct pnfs_block_layout *bl,
- struct pnfs_block_extent *new)
-{
- struct pnfs_block_extent *be, *tmp;
- sector_t end = new->be_f_offset + new->be_length;
- struct list_head *list;
-
- dprintk("%s enter with be=%p\n", __func__, new);
- print_bl_extent(new);
- list = &bl->bl_extents[bl_choose_list(new->be_state)];
- print_elist(list);
-
- /* Scan for proper place to insert, extending new to the left
- * as much as possible.
- */
- list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
- if (new->be_f_offset >= be->be_f_offset + be->be_length)
- break;
- if (new->be_f_offset >= be->be_f_offset) {
- if (end <= be->be_f_offset + be->be_length) {
- /* new is a subset of existing be*/
- if (extents_consistent(be, new)) {
- dprintk("%s: new is subset, ignoring\n",
- __func__);
- bl_put_extent(new);
- return 0;
- } else {
- goto out_err;
- }
- } else {
- /* |<-- be -->|
- * |<-- new -->| */
- if (extents_consistent(be, new)) {
- /* extend new to fully replace be */
- new->be_length += new->be_f_offset -
- be->be_f_offset;
- new->be_f_offset = be->be_f_offset;
- new->be_v_offset = be->be_v_offset;
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- }
- } else if (end >= be->be_f_offset + be->be_length) {
- /* new extent overlap existing be */
- if (extents_consistent(be, new)) {
- /* extend new to fully replace be */
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- } else if (end > be->be_f_offset) {
- /* |<-- be -->|
- *|<-- new -->| */
- if (extents_consistent(new, be)) {
- /* extend new to fully replace be */
- new->be_length += be->be_f_offset + be->be_length -
- new->be_f_offset - new->be_length;
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- }
- }
- /* Note that if we never hit the above break, be will not point to a
- * valid extent. However, in that case &be->be_node==list.
- */
- list_add(&new->be_node, &be->be_node);
- dprintk("%s: inserting new\n", __func__);
- print_elist(list);
- /* FIXME - The per-list consistency checks have all been done,
- * should now check cross-list consistency.
- */
- return 0;
-
- out_err:
- bl_put_extent(new);
- return -EIO;
-}
-
-/* Returns extent, or NULL. If a second READ extent exists, it is returned
- * in cow_read, if given.
- *
- * The extents are kept in two seperate ordered lists, one for READ and NONE,
- * one for READWRITE and INVALID. Within each list, we assume:
- * 1. Extents are ordered by file offset.
- * 2. For any given isect, there is at most one extents that matches.
- */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
- struct pnfs_block_extent **cow_read)
-{
- struct pnfs_block_extent *be, *cow, *ret;
- int i;
-
- dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
- cow = ret = NULL;
- spin_lock(&bl->bl_ext_lock);
- for (i = 0; i < EXTENT_LISTS; i++) {
- list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
- if (isect >= be->be_f_offset + be->be_length)
- break;
- if (isect >= be->be_f_offset) {
- /* We have found an extent */
- dprintk("%s Get %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_get(&be->be_refcnt);
- if (!ret)
- ret = be;
- else if (be->be_state != PNFS_BLOCK_READ_DATA)
- bl_put_extent(be);
- else
- cow = be;
- break;
- }
- }
- if (ret &&
- (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
- break;
- }
- spin_unlock(&bl->bl_ext_lock);
- if (cow_read)
- *cow_read = cow;
- print_bl_extent(ret);
- return ret;
-}
-
-/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
-static struct pnfs_block_extent *
-bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
-{
- struct pnfs_block_extent *be, *ret = NULL;
- int i;
-
- dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
- for (i = 0; i < EXTENT_LISTS; i++) {
- if (ret)
- break;
- list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
- if (isect >= be->be_f_offset + be->be_length)
- break;
- if (isect >= be->be_f_offset) {
- /* We have found an extent */
- dprintk("%s Get %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_get(&be->be_refcnt);
- ret = be;
- break;
- }
- }
- }
- print_bl_extent(ret);
- return ret;
-}
-
-int
-encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg)
-{
- struct pnfs_block_short_extent *lce, *save;
- unsigned int count = 0;
- __be32 *p, *xdr_start;
-
- dprintk("%s enter\n", __func__);
- /* BUG - creation of bl_commit is buggy - need to wait for
- * entire block to be marked WRITTEN before it can be added.
- */
- spin_lock(&bl->bl_ext_lock);
- /* Want to adjust for possible truncate */
- /* We now want to adjust argument range */
-
- /* XDR encode the ranges found */
- xdr_start = xdr_reserve_space(xdr, 8);
- if (!xdr_start)
- goto out;
- list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
- p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
- if (!p)
- break;
- p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
- p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, 0LL);
- *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
- list_move_tail(&lce->bse_node, &bl->bl_committing);
- bl->bl_count--;
- count++;
- }
- xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
- xdr_start[1] = cpu_to_be32(count);
-out:
- spin_unlock(&bl->bl_ext_lock);
- dprintk("%s found %i ranges\n", __func__, count);
- return 0;
-}
-
-/* Helper function to set_to_rw that initialize a new extent */
-static void
-_prep_new_extent(struct pnfs_block_extent *new,
- struct pnfs_block_extent *orig,
- sector_t offset, sector_t length, int state)
-{
- kref_init(&new->be_refcnt);
- /* don't need to INIT_LIST_HEAD(&new->be_node) */
- memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
- new->be_mdev = orig->be_mdev;
- new->be_f_offset = offset;
- new->be_length = length;
- new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
- new->be_state = state;
- new->be_inval = orig->be_inval;
-}
-
-/* Tries to merge be with extent in front of it in list.
- * Frees storage if not used.
- */
-static struct pnfs_block_extent *
-_front_merge(struct pnfs_block_extent *be, struct list_head *head,
- struct pnfs_block_extent *storage)
-{
- struct pnfs_block_extent *prev;
-
- if (!storage)
- goto no_merge;
- if (&be->be_node == head || be->be_node.prev == head)
- goto no_merge;
- prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
- if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
- !extents_consistent(prev, be))
- goto no_merge;
- _prep_new_extent(storage, prev, prev->be_f_offset,
- prev->be_length + be->be_length, prev->be_state);
- list_replace(&prev->be_node, &storage->be_node);
- bl_put_extent(prev);
- list_del(&be->be_node);
- bl_put_extent(be);
- return storage;
-
- no_merge:
- kfree(storage);
- return be;
-}
-
-static u64
-set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
-{
- u64 rv = offset + length;
- struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
- struct pnfs_block_extent *children[3];
- struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
- int i = 0, j;
-
- dprintk("%s(%llu, %llu)\n", __func__, offset, length);
- /* Create storage for up to three new extents e1, e2, e3 */
- e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
- e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
- e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
- /* BUG - we are ignoring any failure */
- if (!e1 || !e2 || !e3)
- goto out_nosplit;
-
- spin_lock(&bl->bl_ext_lock);
- be = bl_find_get_extent_locked(bl, offset);
- rv = be->be_f_offset + be->be_length;
- if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
- spin_unlock(&bl->bl_ext_lock);
- goto out_nosplit;
- }
- /* Add e* to children, bumping e*'s krefs */
- if (be->be_f_offset != offset) {
- _prep_new_extent(e1, be, be->be_f_offset,
- offset - be->be_f_offset,
- PNFS_BLOCK_INVALID_DATA);
- children[i++] = e1;
- print_bl_extent(e1);
- } else
- merge1 = e1;
- _prep_new_extent(e2, be, offset,
- min(length, be->be_f_offset + be->be_length - offset),
- PNFS_BLOCK_READWRITE_DATA);
- children[i++] = e2;
- print_bl_extent(e2);
- if (offset + length < be->be_f_offset + be->be_length) {
- _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
- be->be_f_offset + be->be_length -
- offset - length,
- PNFS_BLOCK_INVALID_DATA);
- children[i++] = e3;
- print_bl_extent(e3);
- } else
- merge2 = e3;
-
- /* Remove be from list, and insert the e* */
- /* We don't get refs on e*, since this list is the base reference
- * set when init'ed.
- */
- if (i < 3)
- children[i] = NULL;
- new = children[0];
- list_replace(&be->be_node, &new->be_node);
- bl_put_extent(be);
- new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
- for (j = 1; j < i; j++) {
- old = new;
- new = children[j];
- list_add(&new->be_node, &old->be_node);
- }
- if (merge2) {
- /* This is a HACK, should just create a _back_merge function */
- new = list_entry(new->be_node.next,
- struct pnfs_block_extent, be_node);
- new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
- }
- spin_unlock(&bl->bl_ext_lock);
-
- /* Since we removed the base reference above, be is now scheduled for
- * destruction.
- */
- bl_put_extent(be);
- dprintk("%s returns %llu after split\n", __func__, rv);
- return rv;
-
- out_nosplit:
- kfree(e1);
- kfree(e2);
- kfree(e3);
- dprintk("%s returns %llu without splitting\n", __func__, rv);
- return rv;
-}
-
-void
-clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- const struct nfs4_layoutcommit_args *arg,
- int status)
-{
- struct pnfs_block_short_extent *lce, *save;
-
- dprintk("%s status %d\n", __func__, status);
- list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
- if (likely(!status)) {
- u64 offset = lce->bse_f_offset;
- u64 end = offset + lce->bse_length;
-
- do {
- offset = set_to_rw(bl, offset, end - offset);
- } while (offset < end);
- list_del(&lce->bse_node);
-
- kfree(lce);
- } else {
- list_del(&lce->bse_node);
- spin_lock(&bl->bl_ext_lock);
- add_to_commitlist(bl, lce);
- spin_unlock(&bl->bl_ext_lock);
- }
- }
-}
-
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
-{
- struct pnfs_block_short_extent *new;
-
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (unlikely(!new))
- return -ENOMEM;
-
- spin_lock_bh(&marks->im_lock);
- list_add(&new->bse_node, &marks->im_extents);
- spin_unlock_bh(&marks->im_lock);
-
- return 0;
-}
-
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
-{
- struct pnfs_block_short_extent *rv = NULL;
-
- spin_lock_bh(&marks->im_lock);
- if (!list_empty(&marks->im_extents)) {
- rv = list_entry((&marks->im_extents)->next,
- struct pnfs_block_short_extent, bse_node);
- list_del_init(&rv->bse_node);
- }
- spin_unlock_bh(&marks->im_lock);
-
- return rv;
-}
-
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
-{
- struct pnfs_block_short_extent *se = NULL, *tmp;
-
- if (num_to_free <= 0)
- return;
-
- spin_lock(&marks->im_lock);
- list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
- list_del(&se->bse_node);
- kfree(se);
- if (--num_to_free == 0)
- break;
- }
- spin_unlock(&marks->im_lock);
-
- BUG_ON(num_to_free > 0);
-}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..8d04bda2bd2e
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * Serialize the "simple" volume description @b into the buffer at @p.
+ *
+ * Words written, in order: a constant 1 (presumably the volume count; the
+ * caller budgets it as "single volume" - TODO confirm), the volume type and
+ * the number of signatures.  Each signature then follows as its offset
+ * (xdr_encode_hyper) and its bytes (xdr_encode_opaque).
+ *
+ * The caller must supply a buffer large enough for the encoded result.
+ */
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+	int idx = 0;
+
+	/* Fixed three-word preamble. */
+	p[0] = cpu_to_be32(1);
+	p[1] = cpu_to_be32(b->type);
+	p[2] = cpu_to_be32(b->simple.nr_sigs);
+	p += 3;
+
+	/* Variable part: one (offset, opaque signature) pair per entry. */
+	while (idx < b->simple.nr_sigs) {
+		p = xdr_encode_hyper(p, b->simple.sigs[idx].offset);
+		p = xdr_encode_opaque(p, b->simple.sigs[idx].sig,
+				      b->simple.sigs[idx].sig_len);
+		idx++;
+	}
+}
+
+/*
+ * Ask the userspace block layout daemon to resolve the simple volume @b to
+ * a local block device.
+ *
+ * Builds a BL_DEVICE_MOUNT message (header followed by the encoded volume),
+ * queues it on the per-net blocklayout pipe and sleeps on nn->bl_wq.  The
+ * sleeper is woken either by the downcall handler, once the daemon's answer
+ * has been copied into nn->bl_mount_reply, or by the message destructor
+ * when the upcall failed.
+ *
+ * @server:   NFS server whose client network namespace owns the pipe
+ * @b:        volume to resolve; note that b->simple.len is increased by 4
+ * @gfp_mask: allocation flags for the message buffer
+ *
+ * Returns the resolved device number, or 0 on any failure.
+ */
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+		gfp_t gfp_mask)
+{
+	struct net *net = server->nfs_client->cl_net;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct bl_dev_msg *reply = &nn->bl_mount_reply;
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_msg_hdr *bl_msg;
+	DECLARE_WAITQUEUE(wq, current);
+	dev_t dev = 0;
+	int rc;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+	bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+	b->simple.len += 4;	/* single volume */
+	if (b->simple.len > PAGE_SIZE) {
+		/*
+		 * Too large for one upcall.  This function returns dev_t, so
+		 * failure is reported as 0 like every other error path; a
+		 * negative errno would be converted into a bogus device
+		 * number.  Return directly: msg has not been initialized
+		 * yet, so the kfree() under "out" must not run.
+		 */
+		return 0;
+	}
+
+	memset(msg, 0, sizeof(*msg));
+	msg->len = sizeof(*bl_msg) + b->simple.len;
+	msg->data = kzalloc(msg->len, gfp_mask);
+	if (!msg->data)
+		goto out;
+
+	bl_msg = msg->data;
+	bl_msg->type = BL_DEVICE_MOUNT;
+	bl_msg->totallen = b->simple.len;
+	nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	/* Get on the wait queue before the daemon can possibly answer. */
+	add_wait_queue(&nn->bl_wq, &wq);
+	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+	if (rc < 0) {
+		remove_wait_queue(&nn->bl_wq, &wq);
+		goto out;
+	}
+
+	/*
+	 * NOTE(review): the task state is only changed after the upcall has
+	 * been queued; a wake-up arriving before this point looks like it
+	 * could be missed - confirm against the wait-queue rules.
+	 */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&nn->bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		printk(KERN_WARNING "%s failed to decode device: %d\n",
+			__func__, reply->status);
+		goto out;
+	}
+
+	dev = MKDEV(reply->major, reply->minor);
+out:
+	kfree(msg->data);
+	return dev;
+}
+
+/*
+ * Downcall handler for the blocklayout pipe: receives the daemon's reply.
+ *
+ * The write must be exactly one struct bl_dev_msg.  It is copied into the
+ * per-net bl_mount_reply slot and any task sleeping on nn->bl_wq is woken.
+ *
+ * Returns @mlen on success, -EINVAL for a wrongly sized message, or -EFAULT
+ * if the user buffer could not be copied.
+ */
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+				size_t mlen)
+{
+	struct net *net = filp->f_dentry->d_sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	ssize_t ret;
+
+	if (mlen != sizeof(struct bl_dev_msg)) {
+		ret = -EINVAL;
+	} else if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) {
+		ret = -EFAULT;
+	} else {
+		wake_up(&nn->bl_wq);
+		ret = mlen;
+	}
+
+	return ret;
+}
+
+/*
+ * Message destructor for the blocklayout pipe.
+ *
+ * Only acts when the message carries a negative errno: in that case the
+ * wait queue recorded in the enclosing bl_pipe_msg is woken.  Messages with
+ * a non-negative errno need no action here.
+ */
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct bl_pipe_msg *owner;
+
+	if (msg->errno < 0) {
+		owner = container_of(msg, struct bl_pipe_msg, msg);
+		wake_up(owner->bl_wq);
+	}
+}
+
+/*
+ * Operations for the blocklayout pipe: the generic rpc_pipefs upcall
+ * routine, plus this file's handlers for daemon replies (downcall) and for
+ * message destruction.
+ */
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+/*
+ * Create the "blocklayout" pipe under the NFS pipe directory of the
+ * rpc_pipefs superblock @sb.
+ *
+ * Returns whatever rpc_mkpipe_dentry() returns, or ERR_PTR(-ENOENT) when
+ * the NFS pipe directory cannot be found.
+ */
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+						  struct rpc_pipe *pipe)
+{
+	struct dentry *parent = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+	struct dentry *pipe_dentry;
+
+	if (!parent)
+		return ERR_PTR(-ENOENT);
+
+	pipe_dentry = rpc_mkpipe_dentry(parent, "blocklayout", NULL, pipe);
+	/* The parent reference was only needed to create the pipe. */
+	dput(parent);
+
+	return pipe_dentry;
+}
+
+/*
+ * Remove the pipe's dentry from rpc_pipefs, if it has one.
+ * @sb is not used by this function.
+ */
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+					  struct rpc_pipe *pipe)
+{
+	if (!pipe->dentry)
+		return;
+
+	rpc_unlink(pipe->dentry);
+}
+
+/*
+ * rpc_pipefs notifier callback: @ptr is the rpc_pipefs superblock the
+ * event refers to.
+ *
+ * On RPC_PIPEFS_MOUNT the blocklayout pipe is created in that superblock
+ * and its dentry recorded; on RPC_PIPEFS_UMOUNT an existing dentry is
+ * removed.  Any other event yields -ENOTSUPP.  Nothing is done (and 0 is
+ * returned) when a module reference cannot be taken or when the namespace
+ * has no blocklayout pipe.
+ */
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+			   void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *pipe_dentry;
+	int err = 0;
+
+	/* Hold a module reference for the duration of the callback. */
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (nn->bl_device_pipe == NULL)
+		goto out_put;
+
+	if (event == RPC_PIPEFS_MOUNT) {
+		pipe_dentry = nfs4blocklayout_register_sb(sb,
+							  nn->bl_device_pipe);
+		if (IS_ERR(pipe_dentry))
+			err = PTR_ERR(pipe_dentry);
+		else
+			nn->bl_device_pipe->dentry = pipe_dentry;
+	} else if (event == RPC_PIPEFS_UMOUNT) {
+		if (nn->bl_device_pipe->dentry)
+			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+	} else {
+		err = -ENOTSUPP;
+	}
+
+out_put:
+	module_put(THIS_MODULE);
+	return err;
+}
+
+/* Notifier block that routes rpc_pipefs events to rpc_pipefs_event(). */
+static struct notifier_block nfs4blocklayout_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+/*
+ * Register the blocklayout pipe in the rpc_pipefs superblock of @net.
+ *
+ * Returns NULL when rpc_get_sb_net() yields no superblock, otherwise the
+ * result of nfs4blocklayout_register_sb() (a dentry or an ERR_PTR).
+ */
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+						   struct rpc_pipe *pipe)
+{
+	struct dentry *pipe_dentry = NULL;
+	struct super_block *sb = rpc_get_sb_net(net);
+
+	if (sb) {
+		pipe_dentry = nfs4blocklayout_register_sb(sb, pipe);
+		rpc_put_sb_net(net);
+	}
+
+	return pipe_dentry;
+}
+
+/*
+ * Remove the blocklayout pipe from the rpc_pipefs superblock of @net.
+ * Does nothing when rpc_get_sb_net() yields no superblock.
+ */
+static void nfs4blocklayout_unregister_net(struct net *net,
+					   struct rpc_pipe *pipe)
+{
+	struct super_block *sb = rpc_get_sb_net(net);
+
+	if (!sb)
+		return;
+
+	nfs4blocklayout_unregister_sb(sb, pipe);
+	rpc_put_sb_net(net);
+}
+
+/*
+ * Per-network-namespace setup: initialize the reply wait queue, allocate
+ * the blocklayout pipe and try to register it with rpc_pipefs.
+ *
+ * Returns 0 on success or the error from pipe allocation/registration.
+ * When registration fails the pipe data is destroyed again.
+ */
+static int nfs4blocklayout_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct rpc_pipe *pipe;
+	struct dentry *pipe_dentry;
+
+	init_waitqueue_head(&nn->bl_wq);
+
+	pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+	nn->bl_device_pipe = pipe;
+	if (IS_ERR(pipe))
+		return PTR_ERR(pipe);
+
+	pipe_dentry = nfs4blocklayout_register_net(net, pipe);
+	if (IS_ERR(pipe_dentry)) {
+		rpc_destroy_pipe_data(pipe);
+		return PTR_ERR(pipe_dentry);
+	}
+
+	/* May be NULL when the namespace has no rpc_pipefs superblock. */
+	pipe->dentry = pipe_dentry;
+	return 0;
+}
+
+/*
+ * Per-network-namespace teardown: unregister the blocklayout pipe from
+ * rpc_pipefs, destroy its data and clear the per-net pointer.
+ */
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct rpc_pipe *pipe = nn->bl_device_pipe;
+
+	nfs4blocklayout_unregister_net(net, pipe);
+	rpc_destroy_pipe_data(pipe);
+	nn->bl_device_pipe = NULL;
+}
+
+/* Per-network-namespace init/exit hooks for the blocklayout pipe. */
+static struct pernet_operations nfs4blocklayout_net_ops = {
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
+/*
+ * Module-init helper: register the rpc_pipefs notifier and then the
+ * per-net operations.  If the second step fails the notifier is
+ * unregistered again.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+int __init bl_init_pipefs(void)
+{
+	int err = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+
+	if (err)
+		return err;
+
+	err = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	if (err)
+		rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+
+	return err;
+}
+
+/*
+ * Module-exit helper: unregister the rpc_pipefs notifier, then the per-net
+ * operations (the two registrations made by bl_init_pipefs()).
+ */
+void __exit bl_cleanup_pipefs(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 54de482143cc..b8fb3a4ef649 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
cb_info->serv = serv;
cb_info->rqst = rqstp;
- cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+ cb_info->task = kthread_create(callback_svc, cb_info->rqst,
"nfsv4.%u-svc", minorversion);
if (IS_ERR(cb_info->task)) {
ret = PTR_ERR(cb_info->task);
@@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
cb_info->task = NULL;
return ret;
}
+ rqstp->rq_task = cb_info->task;
+ wake_up_process(cb_info->task);
dprintk("nfs_callback_up: service started\n");
return 0;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto out;
ino = lo->plh_inode;
+
+ spin_lock(&ino->i_lock);
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&ino->i_lock);
+
+ pnfs_layoutcommit_inode(ino, false);
+
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range))
+ &args->cbl_range)) {
rv = NFS4ERR_DELAY;
- else
- rv = NFS4ERR_NOMATCHING_LAYOUT;
- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ goto unlock;
+ }
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+ &args->cbl_range);
+ }
+unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
}
found:
- if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
- dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
- "deleting instead\n", __func__);
nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1c5ff6d58385..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
* set up the iterator to start reading from the server list and return the first item
*/
static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
{
struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
* clean up after reading from the transports list
*/
static void nfs_server_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
{
struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
@@ -1318,7 +1320,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
*/
static int nfs_volume_list_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &nfs_server_list_ops,
+ return seq_open_net(inode, file, &nfs_volume_list_ops,
sizeof(struct seq_net_private));
}
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
* set up the iterator to start reading from the volume list and return the first item
*/
static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
{
struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
* clean up after reading from the transports list
*/
static void nfs_volume_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
{
struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
@@ -1412,24 +1416,18 @@ int nfs_fs_proc_net_init(struct net *net)
p = proc_create("volumes", S_IFREG|S_IRUGO,
nn->proc_nfsfs, &nfs_volume_list_fops);
if (!p)
- goto error_2;
+ goto error_1;
return 0;
-error_2:
- remove_proc_entry("servers", nn->proc_nfsfs);
error_1:
- remove_proc_entry("fs/nfsfs", NULL);
+ remove_proc_subtree("nfsfs", net->proc_net);
error_0:
return -ENOMEM;
}
void nfs_fs_proc_net_exit(struct net *net)
{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- remove_proc_entry("volumes", nn->proc_nfsfs);
- remove_proc_entry("servers", nn->proc_nfsfs);
- remove_proc_entry("fs/nfsfs", NULL);
+ remove_proc_subtree("nfsfs", net->proc_net);
}
/*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
/*
* nfs_direct_cmp_commit_data_verf - compare verifier for commit data
* @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0);
return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
}
-#endif
/**
* nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
return result;
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
- nfs_direct_complete(dreq, true);
-}
-#endif
-
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d1898..6920127c5eb7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#include "nfstrace.h"
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
unsigned int end = offset + len;
+ if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+ if (!PageUptodate(page))
+ return 1;
+ return 0;
+ }
+
if ((file->f_mode & FMODE_READ) && /* open for read? */
!PageUptodate(page) && /* Uptodate? */
!PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
- * doing this memory reclaim for a fs-related allocation.
+ /* Always try to initiate a 'commit' if relevant, but only
+ * wait for it if __GFP_WAIT is set. Even then, only wait 1
+ * second and only if the 'bdi' is not congested.
+ * Waiting indefinitely can cause deadlocks when the NFS
+ * server is on this machine, when a new TCP connection is
+ * needed and in other rare cases. There is no particular
+ * need to wait extensively here. A short wait has the
+ * benefit that someone else can worry about the freezer.
*/
- if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
- !(current->flags & PF_FSTRANS)) {
- int how = FLUSH_SYNC;
-
- /* Don't let kswapd deadlock waiting for OOM RPC calls */
- if (current_is_kswapd())
- how = 0;
- nfs_commit_inode(mapping->host, how);
+ if (mapping) {
+ struct nfs_server *nfss = NFS_SERVER(mapping->host);
+ nfs_commit_inode(mapping->host, 0);
+ if ((gfp & __GFP_WAIT) &&
+ !bdi_write_congested(&nfss->backing_dev_info)) {
+ wait_on_page_bit_killable_timeout(page, PG_private,
+ HZ);
+ if (PagePrivate(page))
+ set_bdi_congested(&nfss->backing_dev_info,
+ BLK_RW_ASYNC);
+ }
}
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
+ int ret;
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
*span = sis->pages;
- return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+
+ rcu_read_lock();
+ ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+ rcu_read_unlock();
+
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
{
- xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+ rcu_read_lock();
+ xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+ rcu_read_unlock();
}
#endif
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1359c4a27393..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
- hdr->res.verf->committed == NFS_FILE_SYNC)
+ hdr->res.verf->committed != NFS_DATA_SYNC)
return;
pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
+ if (data->verf.committed == NFS_UNSTABLE)
+ pnfs_commit_set_layoutcommit(data);
+
return 0;
}
@@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}
/* find and reference the deviceid */
- d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
- NFS_SERVER(lo->plh_inode)->nfs_client, id);
- if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id,
- lo->plh_lc_cred, gfp_flags);
- if (dsaddr == NULL)
- goto out;
- } else
- dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+ lo->plh_lc_cred, gfp_flags);
+ if (d == NULL)
+ goto out;
+
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
/* Found deviceid is unavailable */
if (filelayout_test_devid_unavailable(&dsaddr->id_node))
- goto out_put;
+ goto out_put;
fl->dsaddr = dsaddr;
@@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
+ struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
int i;
- for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
+ for (i = idx; i < fl_cinfo->nbuckets; i++) {
+ bucket = &fl_cinfo->buckets[i];
if (list_empty(&bucket->committing))
continue;
nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
@@ -1367,6 +1368,17 @@ out:
cinfo->ds->ncommitting = 0;
return PNFS_ATTEMPTED;
}
+/*
+ * Layout-driver hook: build a file-layout device from @pdev and hand back
+ * its embedded generic deviceid node, or NULL if
+ * nfs4_fl_alloc_deviceid_node() returned NULL.
+ */
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_file_layout_dsaddr *decoded =
+		nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+
+	return decoded ? &decoded->id_node : NULL;
+}
static void
filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1419,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .alloc_deviceid_node = filelayout_alloc_deviceid_node,
.free_deviceid_node = filelayout_free_deveiceid_node,
};
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred, gfp_t gfp_flags);
#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
}
/* Decode opaque device data and return the result */
-static struct nfs4_file_layout_dsaddr*
-decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
int i;
u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL;
dsaddr->ds_num = num;
- nfs4_init_deviceid_node(&dsaddr->id_node,
- NFS_SERVER(ino)->pnfs_curr_ld,
- NFS_SERVER(ino)->nfs_client,
- &pdev->dev_id);
+ nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
INIT_LIST_HEAD(&dsaddrs);
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+ da = decode_ds_addr(server->nfs_client->cl_net,
&stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
return NULL;
}
-/*
- * Decode the opaque device specified in 'dev' and add it to the cache of
- * available devices.
- */
-static struct nfs4_file_layout_dsaddr *
-decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct nfs4_file_layout_dsaddr *n, *new;
-
- new = decode_device(inode, dev, gfp_flags);
- if (!new) {
- printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
- __func__);
- return NULL;
- }
-
- d = nfs4_insert_deviceid_node(&new->id_node);
- n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
- if (n != new) {
- nfs4_fl_free_deviceid(new);
- return n;
- }
-
- return new;
-}
-
-/*
- * Retrieve the information for dev_id, add it to the list
- * of available devices, and return it.
- */
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode,
- struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred,
- gfp_t gfp_flags)
-{
- struct pnfs_device *pdev = NULL;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- struct nfs4_file_layout_dsaddr *dsaddr = NULL;
- int rc, i;
- struct nfs_server *server = NFS_SERVER(inode);
-
- /*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
- */
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
- __func__, inode, max_resp_sz, max_pages);
-
- pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
- if (pdev == NULL)
- return NULL;
-
- pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
- if (pages == NULL) {
- kfree(pdev);
- return NULL;
- }
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(gfp_flags);
- if (!pages[i])
- goto out_free;
- }
-
- memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
- pdev->layout_type = LAYOUT_NFSV4_1_FILES;
- pdev->pages = pages;
- pdev->pgbase = 0;
- pdev->pglen = max_resp_sz;
- pdev->mincount = 0;
- pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc)
- goto out_free;
-
- /*
- * Found new device, need to decode it and then add it to the
- * list of known devices for this mountpoint.
- */
- dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
-out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(pdev);
- dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
- return dsaddr;
-}
-
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
struct nfs_server_key *key = buffer;
uint16_t len = sizeof(struct nfs_server_key);
+ memset(key, 0, len);
key->nfsversion = clp->rpc_ops->version;
key->family = clp->cl_addr.ss_family;
- memset(key, 0, len);
-
switch (clp->cl_addr.ss_family) {
case AF_INET:
key->port = sin->sin_port;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
- if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode))
+ BUG_ON(!S_ISREG(inode->i_mode));
+
+ if (attr->ia_size == i_size_read(inode))
attr->ia_valid &= ~ATTR_SIZE;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9056622d2230..14ae6f20a172 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
#endif
-/* nfs3client.c */
-#if IS_ENABLED(CONFIG_NFS_V3)
-struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
-struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
- struct nfs_fattr *, rpc_authflavor_t);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+/*
+ * Stub used when CONFIG_NFS_V3_ACL is not defined: ignores its arguments
+ * and returns 0.
+ */
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fattr *, rpc_authflavor_t);
+
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index d0fec260132a..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
#include <linux/nfsacl.h>
#include "internal.h"
+#include "nfs3_fs.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
.rpc_argp = &args,
.rpc_resp = &fattr,
};
- int status;
+ int status = 0;
+
+ if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
+ goto out;
status = -EOPNOTSUPP;
if (!nfs_server_capable(inode, NFS_CAP_ACLS))
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include "internal.h"
+#include "nfs3_fs.h"
#ifdef CONFIG_NFS_V3_ACL
static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
#include "iostat.h"
#include "internal.h"
+#include "nfs3_fs.h"
#define NFSDBG_FACILITY NFSDBG_PROC
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
#include <linux/module.h>
#include <linux/nfs_fs.h>
#include "internal.h"
+#include "nfs3_fs.h"
#include "nfs.h"
static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 92193eddb41d..a8b855ab4e22 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -130,16 +130,15 @@ enum {
*/
struct nfs4_lock_state {
- struct list_head ls_locks; /* Other lock stateids */
- struct nfs4_state * ls_state; /* Pointer to open state */
+ struct list_head ls_locks; /* Other lock stateids */
+ struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
#define NFS_LOCK_LOST 1
- unsigned long ls_flags;
+ unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
- nfs4_stateid ls_stateid;
- atomic_t ls_count;
- fl_owner_t ls_owner;
- struct work_struct ls_release;
+ nfs4_stateid ls_stateid;
+ atomic_t ls_count;
+ fl_owner_t ls_owner;
};
/* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 53e435a95260..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos" */
if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (pos->cl_clientid != new->cl_clientid)
continue;
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos", especially the client
* ID and serverowner fields. Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (!nfs4_match_clientids(pos, new))
continue;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75ae8d22f067..5aa55c132aa2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static long nfs4_update_delay(long *timeout)
+{
+ long ret;
+ if (!timeout)
+ return NFS4_POLL_RETRY_MAX;
+ if (*timeout <= 0)
+ *timeout = NFS4_POLL_RETRY_MIN;
+ if (*timeout > NFS4_POLL_RETRY_MAX)
+ *timeout = NFS4_POLL_RETRY_MAX;
+ ret = *timeout;
+ *timeout <<= 1;
+ return ret;
+}
+
static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
{
int res = 0;
might_sleep();
- if (*timeout <= 0)
- *timeout = NFS4_POLL_RETRY_MIN;
- if (*timeout > NFS4_POLL_RETRY_MAX)
- *timeout = NFS4_POLL_RETRY_MAX;
- freezable_schedule_timeout_killable_unsafe(*timeout);
+ freezable_schedule_timeout_killable_unsafe(
+ nfs4_update_delay(timeout));
if (fatal_signal_pending(current))
res = -ERESTARTSYS;
- *timeout <<= 1;
return res;
}
@@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
int ret = -EAGAIN;
for (;;) {
+ spin_lock(&state->owner->so_lock);
if (can_open_cached(state, fmode, open_mode)) {
- spin_lock(&state->owner->so_lock);
- if (can_open_cached(state, fmode, open_mode)) {
- update_open_stateflags(state, fmode);
- spin_unlock(&state->owner->so_lock);
- goto out_return_state;
- }
+ update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
+ goto out_return_state;
}
+ spin_unlock(&state->owner->so_lock);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
if (!can_open_delegated(delegation, fmode)) {
@@ -2226,9 +2234,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
ret = _nfs4_proc_open(opendata);
if (ret != 0) {
if (ret == -ENOENT) {
- d_drop(opendata->dentry);
- d_add(opendata->dentry, NULL);
- nfs_set_verifier(opendata->dentry,
+ dentry = opendata->dentry;
+ if (dentry->d_inode)
+ d_delete(dentry);
+ else if (d_unhashed(dentry))
+ d_add(dentry, NULL);
+
+ nfs_set_verifier(dentry,
nfs_save_change_attribute(opendata->dir->d_inode));
}
goto out;
@@ -2560,6 +2572,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state *state = calldata->state;
struct nfs_server *server = NFS_SERVER(calldata->inode);
+ nfs4_stateid *res_stateid = NULL;
dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -2570,12 +2583,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
*/
switch (task->tk_status) {
case 0:
- if (calldata->roc)
+ res_stateid = &calldata->res.stateid;
+ if (calldata->arg.fmode == 0 && calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
- nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
- goto out_release;
+ break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
@@ -2584,12 +2597,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0)
break;
default:
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
goto out_release;
}
}
- nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2601,6 +2614,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state *state = calldata->state;
struct inode *inode = calldata->inode;
+ bool is_rdonly, is_wronly, is_rdwr;
int call_close = 0;
dprintk("%s: begin!\n", __func__);
@@ -2608,21 +2622,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_wait;
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
- calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
spin_lock(&state->owner->so_lock);
+ is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
+ is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
/* Calculate the change in open mode */
+ calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
- if (state->n_rdonly == 0) {
- call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
- call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
- calldata->arg.fmode &= ~FMODE_READ;
- }
- if (state->n_wronly == 0) {
- call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
- call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
- calldata->arg.fmode &= ~FMODE_WRITE;
- }
- }
+ if (state->n_rdonly == 0)
+ call_close |= is_rdonly;
+ else if (is_rdonly)
+ calldata->arg.fmode |= FMODE_READ;
+ if (state->n_wronly == 0)
+ call_close |= is_wronly;
+ else if (is_wronly)
+ calldata->arg.fmode |= FMODE_WRITE;
+ } else if (is_rdwr)
+ calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+ if (calldata->arg.fmode == 0)
+ call_close |= is_rdwr;
+
if (!nfs4_valid_open_stateid(state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -3205,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_label *label = NULL;
int status;
- if (pnfs_ld_layoutret_on_setattr(inode))
+ if (pnfs_ld_layoutret_on_setattr(inode) &&
+ sattr->ia_valid & ATTR_SIZE &&
+ sattr->ia_size < i_size_read(inode))
pnfs_commit_and_return_layout(inode);
nfs_fattr_init(fattr);
@@ -3564,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL,
+ &data->timeout) == -EAGAIN)
return 0;
update_changeattr(dir, &res->cinfo);
return 1;
@@ -3597,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
return 0;
update_changeattr(old_dir, &res->old_cinfo);
@@ -4101,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
trace_nfs4_read(hdr, task->tk_status);
if (nfs4_async_handle_error(task, server,
- hdr->args.context->state) == -EAGAIN) {
+ hdr->args.context->state,
+ NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -4169,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
struct inode *inode = hdr->inode;
-
+
trace_nfs4_write(hdr, task->tk_status);
if (nfs4_async_handle_error(task, NFS_SERVER(inode),
- hdr->args.context->state) == -EAGAIN) {
+ hdr->args.context->state,
+ NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -4252,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
struct inode *inode = data->inode;
trace_nfs4_commit(data, task->tk_status);
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+ NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -4805,7 +4831,8 @@ out:
static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
{
struct nfs_client *clp = server->nfs_client;
@@ -4855,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
#endif /* CONFIG_NFS_V4_1 */
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
+ rpc_delay(task, nfs4_update_delay(timeout));
+ goto restart_call;
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -5095,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
pnfs_roc_set_barrier(data->inode, data->roc_barrier);
break;
default:
- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
- -EAGAIN) {
+ if (nfs4_async_handle_error(task, data->res.server,
+ NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
}
@@ -5360,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
case -NFS4ERR_EXPIRED:
break;
default:
- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, calldata->server,
+ NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
nfs_release_seqid(calldata->arg.seqid);
@@ -5966,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
break;
case -NFS4ERR_LEASE_MOVED:
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server,
+ NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
}
@@ -7341,7 +7372,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
int ret = 0;
if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
- return 0;
+ return -EAGAIN;
task = _nfs41_proc_sequence(clp, cred, false);
if (IS_ERR(task))
ret = PTR_ERR(task);
@@ -7571,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
} else {
LIST_HEAD(head);
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
spin_unlock(&inode->i_lock);
- /* Mark the bad layout state as invalid, then
- * retry using the open stateid. */
pnfs_free_lseg_list(&head);
+
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
}
}
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
out:
dprintk("<-- %s\n", __func__);
@@ -7738,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
case 0:
break;
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+ if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
break;
rpc_restart_call_prepare(task);
return;
@@ -7797,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
return status;
}
-/*
- * Retrieve the list of Data Server devices from the MDS.
- */
-static int _nfs4_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist)
-{
- struct nfs4_getdevicelist_args args = {
- .fh = fh,
- .layoutclass = server->pnfs_curr_ld->id,
- };
- struct nfs4_getdevicelist_res res = {
- .devlist = devlist,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
- .rpc_argp = &args,
- .rpc_resp = &res,
- };
- int status;
-
- dprintk("--> %s\n", __func__);
- status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
- &res.seq_res, 0);
- dprintk("<-- %s status=%d\n", __func__, status);
- return status;
-}
-
-int nfs4_proc_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist)
-{
- struct nfs4_exception exception = { };
- int err;
-
- do {
- err = nfs4_handle_exception(server,
- _nfs4_getdevicelist(server, fh, devlist),
- &exception);
- } while (exception.retry);
-
- dprintk("%s: err=%d, num_devs=%u\n", __func__,
- err, devlist->num_devs);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *pdev,
@@ -7917,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case 0:
break;
default:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
}
@@ -8213,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
}
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
}
nfs_expire_all_delegations(clp);
} else {
+ int ret;
+
/* Queue an asynchronous RENEW. */
- ops->sched_state_renewal(clp, cred, renew_flags);
+ ret = ops->sched_state_renewal(clp, cred, renew_flags);
put_rpccred(cred);
- goto out_exp;
+ switch (ret) {
+ default:
+ goto out_exp;
+ case -EAGAIN:
+ case -ENOMEM:
+ break;
+ }
}
} else {
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a043f618cd5a..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
return NULL;
}
-static void
-free_lock_state_work(struct work_struct *work)
-{
- struct nfs4_lock_state *lsp = container_of(work,
- struct nfs4_lock_state, ls_release);
- struct nfs4_state *state = lsp->ls_state;
- struct nfs_server *server = state->owner->so_server;
- struct nfs_client *clp = server->nfs_client;
-
- clp->cl_mvops->free_lock_state(server, lsp);
-}
-
/*
* Return a compatible lock_state. If no initialized lock_state structure
* exists, return an uninitialized one.
@@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
- INIT_WORK(&lsp->ls_release, free_lock_state_work);
return lsp;
out_free:
kfree(lsp);
@@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
- if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
- queue_work(nfsiod_workqueue, &lsp->ls_release);
- else {
- server = state->owner->so_server;
+ server = state->owner->so_server;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ struct nfs_client *clp = server->nfs_client;
+
+ clp->cl_mvops->free_lock_state(server, lsp);
+ } else
nfs4_free_lock_state(server, lsp);
- }
}
static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -1717,7 +1705,8 @@ restart:
if (status < 0) {
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
- return nfs4_recovery_handle_error(clp, status);
+ status = nfs4_recovery_handle_error(clp, status);
+ return (status != 0) ? status : -EAGAIN;
}
nfs4_put_state_owner(sp);
@@ -1726,7 +1715,7 @@ restart:
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
- return status;
+ return 0;
}
static int nfs4_check_lease(struct nfs_client *clp)
@@ -1773,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
break;
case -NFS4ERR_STALE_CLIENTID:
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- nfs4_state_clear_reclaim_reboot(clp);
nfs4_state_start_reclaim_reboot(clp);
break;
case -NFS4ERR_CLID_INUSE:
@@ -2357,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
status = nfs4_check_lease(clp);
if (status < 0)
goto out_error;
+ continue;
}
if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2378,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim reboot";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->reboot_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- continue;
- nfs4_state_end_reclaim_reboot(clp);
- if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
+ nfs4_state_end_reclaim_reboot(clp);
}
/* Now recover expired state... */
@@ -2393,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim nograce";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->nograce_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
- test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
-#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
- encode_verifier_maxsz)
-#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
- 2 /* nfs_cookie4 gdlr_cookie */ + \
- decode_verifier_maxsz \
- /* verifier4 gdlr_verifier */ + \
- 1 /* gdlr_deviceid_list count */ + \
- XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
- NFS4_DEVICEID4_SIZE) \
- /* gdlr_deviceid_list */ + \
- 1 /* bool gdlr_eof */)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
- XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* layout type */ + \
+ 1 /* maxcount */ + \
+ 1 /* bitmap size */ + \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */)
#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
1 /* layout type */ + \
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap */)
+ 1 /* notification bitmap, word 0 */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
2 /* last byte written */ + \
1 /* nt_timechanged (false) */ + \
1 /* layoutupdate4 layout type */ + \
- 1 /* NULL filelayout layoutupdate4 payload */)
+ 1 /* layoutupdate4 opaque len */)
+ /* the actual content of layoutupdate4 should
+ be allocated by drivers and spliced in
+ using xdr_write_pages */
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_reclaim_complete_maxsz)
-#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
- encode_sequence_maxsz + \
- encode_putfh_maxsz + \
- encode_getdevicelist_maxsz)
-#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
- decode_sequence_maxsz + \
- decode_putfh_maxsz + \
- decode_getdevicelist_maxsz)
#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz +\
encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
#ifdef CONFIG_NFS_V4_1
static void
-encode_getdevicelist(struct xdr_stream *xdr,
- const struct nfs4_getdevicelist_args *args,
- struct compound_hdr *hdr)
-{
- __be32 *p;
- nfs4_verifier dummy = {
- .data = "dummmmmy",
- };
-
- encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
- p = reserve_space(xdr, 16);
- *p++ = cpu_to_be32(args->layoutclass);
- *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
- xdr_encode_hyper(p, 0ULL); /* cookie */
- encode_nfs4_verifier(xdr, &dummy);
-}
-
-static void
encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfs4_getdeviceinfo_args *args,
struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
__be32 *p;
encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
- p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
*p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
- *p++ = cpu_to_be32(0); /* bitmap length 0 */
+
+ p = reserve_space(xdr, 4 + 4);
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
}
static void
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
static int
encode_layoutcommit(struct xdr_stream *xdr,
struct inode *inode,
- const struct nfs4_layoutcommit_args *args,
+ struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
NFS_I(inode)->layout, xdr, args);
- else
- encode_uint32(xdr, 0); /* no layout-type payload */
+ } else {
+ encode_uint32(xdr, args->layoutupdate_len);
+ if (args->layoutupdate_pages) {
+ xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+ args->layoutupdate_len);
+ }
+ }
return 0;
}
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
}
/*
- * Encode GETDEVICELIST request
- */
-static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs4_getdevicelist_args *args)
-{
- struct compound_hdr hdr = {
- .minorversion = nfs4_xdr_minorversion(&args->seq_args),
- };
-
- encode_compound_hdr(xdr, req, &hdr);
- encode_sequence(xdr, &args->seq_args, &hdr);
- encode_putfh(xdr, args->fh, &hdr);
- encode_getdevicelist(xdr, args, &hdr);
- encode_nops(&hdr);
-}
-
-/*
* Encode GETDEVICEINFO request
*/
static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * TODO: Need to handle case when EOF != true;
- */
-static int decode_getdevicelist(struct xdr_stream *xdr,
- struct pnfs_devicelist *res)
-{
- __be32 *p;
- int status, i;
- nfs4_verifier verftemp;
-
- status = decode_op_hdr(xdr, OP_GETDEVICELIST);
- if (status)
- return status;
-
- p = xdr_inline_decode(xdr, 8 + 8 + 4);
- if (unlikely(!p))
- goto out_overflow;
-
- /* TODO: Skip cookie for now */
- p += 2;
-
- /* Read verifier */
- p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
-
- res->num_devs = be32_to_cpup(p);
-
- dprintk("%s: num_dev %d\n", __func__, res->num_devs);
-
- if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
- printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
- __func__, res->num_devs);
- return -EIO;
- }
-
- p = xdr_inline_decode(xdr,
- res->num_devs * NFS4_DEVICEID4_SIZE + 4);
- if (unlikely(!p))
- goto out_overflow;
- for (i = 0; i < res->num_devs; i++)
- p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
- NFS4_DEVICEID4_SIZE);
- res->eof = be32_to_cpup(p);
- return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
-}
-
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct pnfs_device *pdev)
{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
goto out_overflow;
- for (i = 0; i < len; i++, p++) {
- if (be32_to_cpup(p)) {
- dprintk("%s: notifications not supported\n",
+
+ if (be32_to_cpup(p++) &
+ ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
+ dprintk("%s: unsupported notification\n",
+ __func__);
+ }
+
+ for (i = 1; i < len; i++) {
+ if (be32_to_cpup(p++)) {
+ dprintk("%s: unsupported notification\n",
__func__);
return -EIO;
}
@@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
}
/*
- * Decode GETDEVICELIST response
- */
-static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
- struct xdr_stream *xdr,
- struct nfs4_getdevicelist_res *res)
-{
- struct compound_hdr hdr;
- int status;
-
- dprintk("encoding getdevicelist!\n");
-
- status = decode_compound_hdr(xdr, &hdr);
- if (status != 0)
- goto out;
- status = decode_sequence(xdr, &res->seq_res, rqstp);
- if (status != 0)
- goto out;
- status = decode_putfh(xdr);
- if (status != 0)
- goto out;
- status = decode_getdevicelist(xdr, res->devlist);
-out:
- return status;
-}
-
-/*
* Decode GETDEVINFO response
*/
static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
- PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
PROC(BIND_CONN_TO_SESSION,
enc_bind_conn_to_session, dec_bind_conn_to_session),
PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
kfree(de);
}
-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de;
-
- d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
- if (!d)
- return NULL;
-
- de = container_of(d, struct objio_dev_ent, id_node);
- return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id, struct osd_dev *od,
- gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
- struct objio_dev_ent *n;
-
- if (!de) {
- dprintk("%s: -ENOMEM od=%p\n", __func__, od);
- return NULL;
- }
-
- dprintk("%s: Adding od=%p\n", __func__, od);
- nfs4_init_deviceid_node(&de->id_node,
- nfss->pnfs_curr_ld,
- nfss->nfs_client,
- d_id);
- de->od.od = od;
-
- d = nfs4_insert_deviceid_node(&de->id_node);
- n = container_of(d, struct objio_dev_ent, id_node);
- if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
- objio_free_deviceid_node(&de->id_node);
- de = n;
- }
-
- return de;
-}
-
struct objio_segment {
struct pnfs_layout_segment lseg;
@@ -130,29 +84,24 @@ struct objio_state {
/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
- gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct objio_dev_ent *ode;
+ struct objio_dev_ent *ode = NULL;
struct osd_dev *od;
struct osd_dev_info odi;
bool retry_flag = true;
+ __be32 *p;
int err;
- ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode) {
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
- return 0;
- }
+ deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+ if (!deviceaddr)
+ return NULL;
- err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
- if (unlikely(err)) {
- dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
- __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return err;
- }
+ p = page_address(pdev->pages[0]);
+ pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
goto out;
}
- ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
- gfp_flags);
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
dprintk("Adding new dev_id(%llx:%llx)\n",
- _DEVID_LO(d_id), _DEVID_HI(d_id));
+ _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+ ode = kzalloc(sizeof(*ode), gfp_flags);
+ if (!ode) {
+ dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+ goto out;
+ }
+
+ nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+ kfree(deviceaddr);
+
+ ode->od.od = od;
+ return &ode->id_node;
+
out:
- objlayout_put_deviceinfo(deviceaddr);
- return err;
+ kfree(deviceaddr);
+ return NULL;
}
static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct xdr_stream *xdr,
gfp_t gfp_flags)
{
+ struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
objio_seg->oc.first_dev = layout.olo_comps_index;
cur_comp = 0;
while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *ode;
+
copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
- err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
- &src_comp.oc_object_id.oid_device_id,
- gfp_flags);
- if (err)
+
+ d = nfs4_find_get_deviceid(server,
+ &src_comp.oc_object_id.oid_device_id,
+ pnfslay->plh_lc_cred, gfp_flags);
+ if (!d) {
+ err = -ENXIO;
goto err;
- ++cur_comp;
+ }
+
+ ode = container_of(d, struct objio_dev_ent, id_node);
+ objio_seg->oc.ods[cur_comp++] = &ode->od;
}
/* pnfs_osd_xdr_decode_layout_comp returns false on error */
if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_LAYOUTRET_ON_ERROR,
+ .max_deviceinfo_size = PAGE_SIZE,
.owner = THIS_MODULE,
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
dprintk("%s: Return\n", __func__);
}
-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
- struct page *page;
- struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags)
-{
- struct objlayout_deviceinfo *odi;
- struct pnfs_device pd;
- struct page *page, **pages;
- u32 *p;
- int err;
-
- page = alloc_page(gfp_flags);
- if (!page)
- return -ENOMEM;
-
- pages = &page;
- pd.pages = pages;
-
- memcpy(&pd.dev_id, d_id, sizeof(*d_id));
- pd.layout_type = LAYOUT_OSD2_OBJECTS;
- pd.pages = &page;
- pd.pgbase = 0;
- pd.pglen = PAGE_SIZE;
- pd.mincount = 0;
- pd.maxcount = PAGE_SIZE;
-
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
- pnfslay->plh_lc_cred);
- dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
- if (err)
- goto err_out;
-
- p = page_address(page);
- odi = kzalloc(sizeof(*odi), gfp_flags);
- if (!odi) {
- err = -ENOMEM;
- goto err_out;
- }
- pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
- odi->page = page;
- *deviceaddr = &odi->da;
- return 0;
-
-err_out:
- __free_page(page);
- return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
- struct objlayout_deviceinfo *odi = container_of(deviceaddr,
- struct objlayout_deviceinfo,
- da);
-
- __free_page(odi->page);
- kfree(odi);
-}
-
enum {
OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
/*
* exported generic objects function vectors
*/
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index be7cbce6e4c7..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
return 0;
}
+	/*
+	 * Limit the request size so that we can still allocate a page array
+	 * for it without upsetting the slab allocator.  A page array is an
+	 * array of page pointers, so the limit is computed from
+	 * sizeof(struct page *) rather than from the size of struct page.
+	 */
+	if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+			sizeof(struct page *) > PAGE_SIZE)
+		return 0;
+
return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
dprintk("%s freeing layout for inode %lu\n", __func__,
lo->plh_inode->i_ino);
inode = lo->plh_inode;
+
+ pnfs_layoutcommit_inode(inode, false);
+
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
return (s32)(s1 - s2) > 0;
}
-static void
-pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new,
- struct list_head *free_me_list)
-{
- if (nfs4_stateid_match_other(&lo->plh_stateid, new))
- return;
- /* Layout is new! Kill existing layout segments */
- pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
-}
-
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
- } else if (list_empty(&lo->plh_segs)) {
+ } else if (list_empty(&lo->plh_segs) ||
+ test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
int seq;
do {
@@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
empty = list_empty(&lo->plh_segs);
pnfs_clear_layoutcommit(ino, &tmp_list);
pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+ }
+
/* Don't send a LAYOUTRETURN if list was initially empty */
if (empty) {
spin_unlock(&ino->i_lock);
@@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout segments to return\n", __func__);
goto out;
}
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
lo->plh_block_lgets++;
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out;
}
+ init_lseg(lo, lseg);
+ lseg->pls_range = res->range;
+
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
dprintk("%s forget reply due to recall\n", __func__);
goto out_forget_reply;
}
- if (pnfs_layoutgets_blocked(lo, 1) ||
- pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ if (pnfs_layoutgets_blocked(lo, 1)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
- /* Check that the new stateid matches the old stateid */
- pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
- /* Done processing layoutget. Set the layout stateid */
- pnfs_set_layout_stateid(lo, &res->stateid, false);
+ if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ /* existing state ID, make sure the sequence number matches. */
+ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ dprintk("%s forget reply due to sequence\n", __func__);
+ goto out_forget_reply;
+ }
+ pnfs_set_layout_stateid(lo, &res->stateid, false);
+ } else {
+ /*
+ * We got an entirely new state ID. Mark all segments for the
+ * inode invalid, and don't bother validating the stateid
+ * sequence number.
+ */
+ pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+ nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+ lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+ }
+
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- init_lseg(lo, lseg);
- lseg->pls_range = res->range;
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg);
@@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+ struct inode *inode = data->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool mark_as_dirty = false;
+
+ spin_lock(&inode->i_lock);
+ if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ mark_as_dirty = true;
+ dprintk("%s: Set layoutcommit for inode %lu ",
+ __func__, inode->i_ino);
+ }
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+ /* references matched in nfs4_layoutcommit_release */
+ pnfs_get_lseg(data->lseg);
+ }
+ if (data->lwb > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = data->lwb;
+ spin_unlock(&inode->i_lock);
+ dprintk("%s: lseg %p end_pos %llu\n",
+ __func__, data->lseg, nfsi->layout->plh_lwb);
+
+ /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+ * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+ if (mark_as_dirty)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
loff_t end_pos;
@@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
data->args.lastbytewritten = end_pos - 1;
data->res.server = NFS_SERVER(inode);
+ if (ld->prepare_layoutcommit) {
+ status = ld->prepare_layoutcommit(&data->args);
+ if (status) {
+ spin_lock(&inode->i_lock);
+ if (end_pos < nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ spin_unlock(&inode->i_lock);
+ put_rpccred(data->cred);
+ set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+ goto clear_layoutcommitting;
+ }
+ }
+
+
status = nfs4_proc_layoutcommit(data, sync);
out:
if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,12 +65,15 @@ enum {
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
+ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
};
enum layoutdriver_policy_flags {
- /* Should the pNFS client commit and return the layout upon a setattr */
+ /* Should the pNFS client commit and return the layout upon truncate to
+ * a smaller size */
PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
+ PNFS_READ_WHOLE_PAGE = 1 << 2,
};
struct nfs4_deviceid_node;
@@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type {
const char *name;
struct module *owner;
unsigned flags;
+ unsigned max_deviceinfo_size;
int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
int (*clear_layoutdriver) (struct nfs_server *);
@@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*return_range) (struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range);
+
/* test for nfs page cache coalescing */
const struct nfs_pageio_ops *pg_read_ops;
const struct nfs_pageio_ops *pg_write_ops;
@@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type {
enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+ struct nfs4_deviceid_node * (*alloc_deviceid_node)
+ (struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
- void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
};
@@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
/* nfs4proc.c */
-extern int nfs4_proc_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
struct rpc_cred *cred);
@@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
void pnfs_set_layoutcommit(struct nfs_pgio_header *);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
@@ -255,11 +263,12 @@ struct nfs4_deviceid_node {
atomic_t ref;
};
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
- const struct pnfs_layoutdriver_type *,
- const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
@@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+ atomic_inc(&d->ref);
+ return d;
+}
+
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}
static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline bool
pnfs_layoutcommit_outstanding(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}
static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ return false;
+}
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
*/
#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
return NULL;
}
+/*
+ * Fetch the device info for @dev_id from the server with GETDEVICEINFO and
+ * hand the reply to the layout driver's ->alloc_deviceid_node() to decode.
+ *
+ * @server:    server whose current layout driver and client session are used
+ * @dev_id:    device id to look up
+ * @cred:      credential passed through to nfs4_proc_getdeviceinfo()
+ * @gfp_flags: allocation flags for the request, the reply pages and the node
+ *
+ * Returns the node produced by the layout driver, or NULL if an allocation
+ * fails, the GETDEVICEINFO call returns an error, or the driver returns NULL.
+ * The temporary request and reply pages are released before returning.
+ */
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+		const struct nfs4_deviceid *dev_id,
+		struct rpc_cred *cred, gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d = NULL;
+	struct pnfs_device *pdev = NULL;
+	struct page **pages = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	int rc, i;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	if (server->pnfs_curr_ld->max_deviceinfo_size &&
+	    server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+		max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+	max_pages = nfs_page_array_len(0, max_resp_sz);
+	dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+		__func__, server, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(*pdev), gfp_flags);
+	if (!pdev)
+		return NULL;
+
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+	if (!pages)
+		goto out_free_pdev;
+
+	/* On failure, i is the number of pages successfully allocated. */
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i])
+			goto out_free_pages;
+	}
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = server->pnfs_curr_ld->id;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = max_resp_sz;
+	pdev->mincount = 0;
+	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free_pages;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+			gfp_flags);
+
+out_free_pages:
+	/*
+	 * Only unwind the pages that were actually allocated: i equals
+	 * max_pages once the allocation loop has completed, and the index of
+	 * the failed slot otherwise.  __free_page() must not be handed the
+	 * NULL entries left behind by a partial allocation.
+	 */
+	while (--i >= 0)
+		__free_page(pages[i]);
+	kfree(pages);
+out_free_pdev:
+	kfree(pdev);
+	dprintk("<-- %s d %p\n", __func__, d);
+	return d;
+}
+
/*
* Lookup a deviceid in cache and get a reference count on it if found
*
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
* @id deviceid to look up
*/
static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id,
- long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, long hash)
{
struct nfs4_deviceid_node *d;
rcu_read_lock();
- d = _lookup_deviceid(ld, clp, id, hash);
+ d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+ hash);
if (d != NULL)
atomic_inc(&d->ref);
rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
}
struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask)
{
- return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ long hash = nfs4_deviceid_hash(id);
+ struct nfs4_deviceid_node *d, *new;
+
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d)
+ return d;
+
+ new = nfs4_get_device_info(server, id, cred, gfp_mask);
+ if (!new)
+ return new;
+
+ spin_lock(&nfs4_deviceid_lock);
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ server->pnfs_curr_ld->free_deviceid_node(new);
+ return d;
+ }
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ atomic_inc(&new->ref);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ return new;
}
EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
- const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
const struct nfs4_deviceid *id)
{
INIT_HLIST_NODE(&d->node);
INIT_HLIST_NODE(&d->tmpnode);
- d->ld = ld;
- d->nfs_client = nfs_client;
+ d->ld = server->pnfs_curr_ld;
+ d->nfs_client = server->nfs_client;
d->flags = 0;
d->deviceid = *id;
atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
/*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- * Note that the caller must set up the following members:
- * new->ld
- * new->nfs_client
- * new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
- struct nfs4_deviceid_node *d;
- long hash;
-
- spin_lock(&nfs4_deviceid_lock);
- hash = nfs4_deviceid_hash(&new->deviceid);
- d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
- if (d) {
- spin_unlock(&nfs4_deviceid_lock);
- return d;
- }
-
- hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
- spin_unlock(&nfs4_deviceid_lock);
- atomic_inc(&new->ref);
-
- return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
-/*
* Dereference a deviceid node and delete it when its reference count drops
* to zero.
*
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
}
rcu_read_unlock();
}
-
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
return NFS_TEXT_DATA;
}
-#if !IS_ENABLED(CONFIG_NFS_V3)
- if (args->version == 3)
- goto out_v3_not_compiled;
-#endif /* !CONFIG_NFS_V3 */
-
return 0;
out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
return -EINVAL;
-#if !IS_ENABLED(CONFIG_NFS_V3)
-out_v3_not_compiled:
- dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
- return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V3 */
-
out_nomem:
dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 175d5d073ccf..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
static void nfs_clear_request_commit(struct nfs_page *req);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page);
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
}
/*
- * nfs_page_search_commits_for_head_request_locked
- *
- * Search through commit lists on @inode for the head request for @page.
- * Must be called while holding the inode (which is cinfo) lock.
- *
- * Returns the head request if found, or NULL if not found.
- */
-static struct nfs_page *
-nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
- struct page *page)
-{
- struct nfs_page *freq, *t;
- struct nfs_commit_info cinfo;
- struct inode *inode = &nfsi->vfs_inode;
-
- nfs_init_cinfo_from_inode(&cinfo, inode);
-
- /* search through pnfs commit lists */
- freq = pnfs_search_commit_reqs(inode, &cinfo, page);
- if (freq)
- return freq->wb_head;
-
- /* Linearly search the commit list for the correct request */
- list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
-
- return NULL;
-}
-
-/*
* nfs_page_find_head_request_locked - find head request associated with @page
*
* must be called while holding the inode lock.
@@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
static int wb_priority(struct writeback_control *wbc)
{
+ int ret = 0;
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = FLUSH_COND_STABLE;
if (wbc->for_kupdate || wbc->for_background)
- return FLUSH_LOWPRI | FLUSH_COND_STABLE;
- return FLUSH_COND_STABLE;
+ ret |= FLUSH_LOWPRI;
+ return ret;
}
/*
@@ -731,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
if (likely(!PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
+ smp_mb__after_atomic();
+ wake_up_page(head->wb_page, PG_private);
clear_bit(PG_MAPPED, &head->wb_flags);
}
nfsi->npages--;
@@ -749,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
__set_page_dirty_nobuffers(req->wb_page);
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page)
+{
+ struct nfs_page *freq, *t;
+ struct nfs_commit_info cinfo;
+ struct inode *inode = &nfsi->vfs_inode;
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ /* search through pnfs commit lists */
+ freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+ if (freq)
+ return freq->wb_head;
+
+ /* Linearly search the commit list for the correct request */
+ list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+
+ return NULL;
+}
+
/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
@@ -867,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
return hdr->verf.committed != NFS_FILE_SYNC;
}
-#else
-static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
- struct inode *inode)
-{
-}
-
-void nfs_init_cinfo(struct nfs_commit_info *cinfo,
- struct inode *inode,
- struct nfs_direct_req *dreq)
-{
-}
-
-void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
-{
-}
-
-static void
-nfs_clear_request_commit(struct nfs_page *req)
-{
-}
-
-int nfs_write_need_commit(struct nfs_pgio_header *hdr)
-{
- return 0;
-}
-
-#endif
-
static void nfs_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_commit_info cinfo;
@@ -932,7 +909,6 @@ out:
hdr->release(hdr);
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
unsigned long
nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
@@ -989,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
return ret;
}
-#else
-unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
-{
- return 0;
-}
-
-int nfs_scan_commit(struct inode *inode, struct list_head *dst,
- struct nfs_commit_info *cinfo)
-{
- return 0;
-}
-#endif
-
/*
* Search for an existing write request, and attempt to update
* it to reflect a new dirty region on a given page.
@@ -1394,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task,
return status;
nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
if (hdr->res.verf->committed < hdr->args.stable &&
task->tk_status >= 0) {
/* We tried a write call, but the server did not
@@ -1416,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task,
complain = jiffies + 300 * HZ;
}
}
-#endif
/* Deal with the suid/sgid bit corner case */
if (nfs_should_remove_suid(inode))
@@ -1469,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task,
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
{
int ret;
@@ -1538,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
}
EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+/*
+ * Return the largest end offset (req_offset() + wb_bytes) among the
+ * requests queued on @head, or 0 if the list is empty.
+ */
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+	struct nfs_page *req;
+	loff_t lwb = 0;
+
+	list_for_each_entry(req, head, wb_list) {
+		loff_t end = req_offset(req) + req->wb_bytes;
+
+		if (end > lwb)
+			lwb = end;
+	}
+
+	return lwb;
+}
+
/*
* Set up the argument/result storage required for the RPC call.
*/
@@ -1557,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
data->inode = inode;
data->cred = first->wb_context->cred;
data->lseg = lseg; /* reference transferred */
+ /* only set lwb for pnfs commit */
+ if (lseg)
+ data->lwb = nfs_get_lwb(&data->pages);
data->mds_ops = &nfs_commit_ops;
data->completion_ops = cinfo->completion_ops;
data->dreq = cinfo->dreq;
@@ -1636,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
+ struct nfs_server *nfss;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
@@ -1669,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
next:
nfs_unlock_and_release_request(req);
}
+ nfss = NFS_SERVER(data->inode);
+ if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+ clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1778,12 +1758,6 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-#else
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
-{
- return 0;
-}
-#endif
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
#
obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
-
nfs_acl-objs := nfsacl.o
+
+obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
/*
* Common code for control of lockd and nfsv4 grace periods.
+ *
+ * Transplanted from lockd code
*/
#include <linux/module.h>
-#include <linux/lockd/bind.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/fs.h>
-#include "netns.h"
-
+static int grace_net_id;
static DEFINE_SPINLOCK(grace_lock);
/**
* locks_start_grace
+ * @net: net namespace that this lock manager belongs to
* @lm: who this grace period is for
*
* A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
*
* This function is called to start a grace period.
*/
-void locks_start_grace(struct net *net, struct lock_manager *lm)
+void
+locks_start_grace(struct net *net, struct lock_manager *lm)
{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct list_head *grace_list = net_generic(net, grace_net_id);
spin_lock(&grace_lock);
- list_add(&lm->list, &ln->grace_list);
+ list_add(&lm->list, grace_list);
spin_unlock(&grace_lock);
}
EXPORT_SYMBOL_GPL(locks_start_grace);
/**
* locks_end_grace
+ * @net: net namespace that this lock manager belongs to
* @lm: who this grace period is for
*
* Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
* Note that callers count on it being safe to call this more than once,
* and the second call should be a no-op.
*/
-void locks_end_grace(struct lock_manager *lm)
+void
+locks_end_grace(struct lock_manager *lm)
{
spin_lock(&grace_lock);
list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
* to answer ordinary lock requests, and when they should accept only
* lock reclaims.
*/
-int locks_in_grace(struct net *net)
+int
+locks_in_grace(struct net *net)
{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct list_head *grace_list = net_generic(net, grace_net_id);
- return !list_empty(&ln->grace_list);
+ return !list_empty(grace_list);
}
EXPORT_SYMBOL_GPL(locks_in_grace);
+
+static int __net_init
+grace_init_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ INIT_LIST_HEAD(grace_list);
+ return 0;
+}
+
+static void __net_exit
+grace_exit_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ BUG_ON(!list_empty(grace_list));
+}
+
+static struct pernet_operations grace_net_ops = {
+ .init = grace_init_net,
+ .exit = grace_exit_net,
+ .id = &grace_net_id,
+ .size = sizeof(struct list_head),
+};
+
+static int __init
+init_grace(void)
+{
+ return register_pernet_subsys(&grace_net_ops);
+}
+
+static void __exit
+exit_grace(void)
+{
+ unregister_pernet_subsys(&grace_net_ops);
+}
+
+MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
+MODULE_LICENSE("GPL");
+module_init(init_grace)
+module_exit(exit_grace)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f994e750e0d1..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
+ select GRACE_PERIOD
help
This option enables support in your system's NFS server for
version 4 of the NFS protocol (RFC 3530).
@@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL
If you do not wish to enable fine-grained security labels SELinux or
Smack policies on NFSv4 files, say N.
- WARNING: there is still a chance of backwards-incompatible protocol changes.
- For now we recommend "Y" only for developers and testers.
-
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b582f9ab6b2a..dd96a3830004 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -18,7 +18,6 @@
* is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
- struct hlist_node c_hash;
struct list_head c_lru;
unsigned char c_state, /* unused, inprog, done */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 72ffd7cce3c3..30a739d896ff 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1145,6 +1145,7 @@ static struct flags {
{ NFSEXP_ALLSQUASH, {"all_squash", ""}},
{ NFSEXP_ASYNC, {"async", "sync"}},
{ NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
+ { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
{ NFSEXP_NOHIDE, {"nohide", ""}},
{ NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index fa2525b2e9d7..12f2aab4f614 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -223,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
attr = &argp->attrs;
- /* Get the directory inode */
- nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
- if (nfserr)
- RETURN_STATUS(nfserr);
-
/* Unfudge the mode bits */
attr->ia_mode &= ~S_IFMT;
if (!(attr->ia_valid & ATTR_MODE)) {
@@ -471,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
resp->buflen = resp->count;
resp->rqstp = rqstp;
offset = argp->cookie;
+
+ nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
+ if (nfserr)
+ RETURN_STATUS(nfserr);
+
+ if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
+ RETURN_STATUS(nfserr_notsupp);
+
nfserr = nfsd_readdir(rqstp, &resp->fh,
&offset,
&resp->common,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e0be57b0f79b..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
/* Index of predefined Linux callback client operations */
-enum {
- NFSPROC4_CLNT_CB_NULL = 0,
- NFSPROC4_CLNT_CB_RECALL,
- NFSPROC4_CLNT_CB_SEQUENCE,
-};
-
struct nfs4_cb_compound_hdr {
/* args */
u32 ident; /* minorversion 0 only */
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct nfsd4_callback *cb)
{
- const struct nfs4_delegation *args = cb->cb_op;
+ const struct nfs4_delegation *dp = cb_to_delegation(cb);
struct nfs4_cb_compound_hdr hdr = {
.ident = cb->cb_clp->cl_cb_ident,
.minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_cb_compound4args(xdr, &hdr);
encode_cb_sequence4args(xdr, cb, &hdr);
- encode_cb_recall4args(xdr, args, &hdr);
+ encode_cb_recall4args(xdr, dp, &hdr);
encode_cb_nops(&hdr);
}
@@ -746,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
static struct workqueue_struct *callback_wq;
-static void run_nfsd4_cb(struct nfsd4_callback *cb)
-{
- queue_work(callback_wq, &cb->cb_work);
-}
-
-static void do_probe_callback(struct nfs4_client *clp)
-{
- struct nfsd4_callback *cb = &clp->cl_cb_null;
-
- cb->cb_op = NULL;
- cb->cb_clp = clp;
-
- cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
- cb->cb_msg.rpc_argp = NULL;
- cb->cb_msg.rpc_resp = NULL;
-
- cb->cb_ops = &nfsd4_cb_probe_ops;
-
- run_nfsd4_cb(cb);
-}
-
/*
* Poke the callback thread to process any updates to the callback
* parameters, and send a null probe.
@@ -775,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
{
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
- do_probe_callback(clp);
+ nfsd4_run_cb(&clp->cl_cb_null);
}
void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -847,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
clp->cl_cb_session->se_cb_seq_nr);
-
- /* We're done looking into the sequence information */
- task->tk_msg.rpc_resp = NULL;
}
-}
-
-
-static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
-{
- struct nfsd4_callback *cb = calldata;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = cb->cb_clp;
- struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
-
- nfsd4_cb_done(task, calldata);
- if (current_rpc_client != task->tk_client) {
+ if (clp->cl_cb_client != task->tk_client) {
/* We're shutting down or changing cl_cb_client; leave
* it to nfsd4_process_cb_update to restart the call if
* necessary. */
@@ -872,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
if (cb->cb_done)
return;
- switch (task->tk_status) {
+
+ switch (cb->cb_ops->done(cb, task)) {
case 0:
- cb->cb_done = true;
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
return;
- case -EBADHANDLE:
- case -NFS4ERR_BAD_STATEID:
- /* Race: client probably got cb_recall
- * before open reply granting delegation */
+ case 1:
break;
- default:
+ case -1:
/* Network partition? */
nfsd4_mark_cb_down(clp, task->tk_status);
+ break;
+ default:
+ BUG();
}
- if (dp->dl_retries--) {
- rpc_delay(task, 2*HZ);
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- return;
- }
- nfsd4_mark_cb_down(clp, task->tk_status);
cb->cb_done = true;
}
-static void nfsd4_cb_recall_release(void *calldata)
+static void nfsd4_cb_release(void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
if (cb->cb_done) {
spin_lock(&clp->cl_lock);
list_del(&cb->cb_per_client);
spin_unlock(&clp->cl_lock);
- nfs4_put_stid(&dp->dl_stid);
+
+ cb->cb_ops->release(cb);
}
}
-static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+static const struct rpc_call_ops nfsd4_cb_ops = {
.rpc_call_prepare = nfsd4_cb_prepare,
- .rpc_call_done = nfsd4_cb_recall_done,
- .rpc_release = nfsd4_cb_recall_release,
+ .rpc_call_done = nfsd4_cb_done,
+ .rpc_release = nfsd4_cb_release,
};
int nfsd4_create_callback_queue(void)
@@ -937,16 +891,10 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
* instead, nfsd4_run_cb_null() will detect the killed
* client, destroy the rpc client, and stop:
*/
- do_probe_callback(clp);
+ nfsd4_run_cb(&clp->cl_cb_null);
flush_workqueue(callback_wq);
}
-static void nfsd4_release_cb(struct nfsd4_callback *cb)
-{
- if (cb->cb_ops->rpc_release)
- cb->cb_ops->rpc_release(cb);
-}
-
/* requires cl_lock: */
static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
{
@@ -1009,63 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
}
/* Yay, the callback channel's back! Restart any callbacks: */
list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
- run_nfsd4_cb(cb);
+ queue_work(callback_wq, &cb->cb_work);
}
static void
-nfsd4_run_callback_rpc(struct nfsd4_callback *cb)
+nfsd4_run_cb_work(struct work_struct *work)
{
+ struct nfsd4_callback *cb =
+ container_of(work, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
+ if (cb->cb_ops && cb->cb_ops->prepare)
+ cb->cb_ops->prepare(cb);
+
if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
nfsd4_process_cb_update(cb);
clnt = clp->cl_cb_client;
if (!clnt) {
/* Callback channel broken, or client killed; give up: */
- nfsd4_release_cb(cb);
+ if (cb->cb_ops && cb->cb_ops->release)
+ cb->cb_ops->release(cb);
return;
}
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
- cb->cb_ops, cb);
+ cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
}
-void
-nfsd4_run_cb_null(struct work_struct *w)
+void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
- struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
- cb_work);
- nfsd4_run_callback_rpc(cb);
-}
-
-void
-nfsd4_run_cb_recall(struct work_struct *w)
-{
- struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
- cb_work);
-
- nfsd4_prepare_cb_recall(cb->cb_op);
- nfsd4_run_callback_rpc(cb);
-}
-
-void nfsd4_cb_recall(struct nfs4_delegation *dp)
-{
- struct nfsd4_callback *cb = &dp->dl_recall;
- struct nfs4_client *clp = dp->dl_stid.sc_client;
-
- dp->dl_retries = 1;
- cb->cb_op = dp;
cb->cb_clp = clp;
- cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
-
- cb->cb_ops = &nfsd4_cb_recall_ops;
-
+ cb->cb_ops = ops;
+ INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
INIT_LIST_HEAD(&cb->cb_per_client);
cb->cb_done = true;
+}
- run_nfsd4_cb(&dp->dl_recall);
+void nfsd4_run_cb(struct nfsd4_callback *cb)
+{
+ queue_work(callback_wq, &cb->cb_work);
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
memset(&ent, 0, sizeof(ent));
/* Authentication name */
- if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+ len = qword_get(&buf, buf1, PAGE_SIZE);
+ if (len <= 0 || len >= IDMAP_NAMESZ)
goto out;
memcpy(ent.authname, buf1, sizeof(ent.authname));
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
/* Name */
error = -EINVAL;
len = qword_get(&buf, buf1, PAGE_SIZE);
- if (len < 0)
+ if (len < 0 || len >= IDMAP_NAMESZ)
goto out;
if (len == 0)
set_bit(CACHE_NEGATIVE, &ent.h.flags);
- else if (len >= IDMAP_NAMESZ)
- goto out;
else
memcpy(ent.name, buf1, sizeof(ent.name));
error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
goto out;
cache_put(&res->h, cd);
-
error = 0;
out:
kfree(buf1);
-
return error;
}
-
static struct ent *
idtoname_lookup(struct cache_detail *cd, struct ent *item)
{
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
{
struct ent ent, *res;
char *buf1;
- int error = -EINVAL;
+ int len, error = -EINVAL;
if (buf[buflen - 1] != '\n')
return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
memset(&ent, 0, sizeof(ent));
/* Authentication name */
- if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+ len = qword_get(&buf, buf1, PAGE_SIZE);
+ if (len <= 0 || len >= IDMAP_NAMESZ)
goto out;
memcpy(ent.authname, buf1, sizeof(ent.authname));
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
/* Name */
- error = qword_get(&buf, buf1, PAGE_SIZE);
- if (error <= 0 || error >= IDMAP_NAMESZ)
+ len = qword_get(&buf, buf1, PAGE_SIZE);
+ if (len <= 0 || len >= IDMAP_NAMESZ)
goto out;
memcpy(ent.name, buf1, sizeof(ent.name));
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
error = 0;
out:
kfree(buf1);
-
return (error);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5e0dc528a0e8..cdeb3cfd6f32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1013,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
+static __be32
+nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_seek *seek)
+{
+ int whence;
+ __be32 status;
+ struct file *file;
+
+ status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+ &seek->seek_stateid,
+ RD_STATE, &file);
+ if (status) {
+ dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+ return status;
+ }
+
+ switch (seek->seek_whence) {
+ case NFS4_CONTENT_DATA:
+ whence = SEEK_DATA;
+ break;
+ case NFS4_CONTENT_HOLE:
+ whence = SEEK_HOLE;
+ break;
+ default:
+ status = nfserr_union_notsupp;
+ goto out;
+ }
+
+ /*
+ * Note: This call does change file->f_pos, but nothing in NFSD
+ * should ever use file->f_pos.
+ */
+ seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+ if (seek->seek_pos < 0)
+ status = nfserrno(seek->seek_pos);
+ else if (seek->seek_pos >= i_size_read(file_inode(file)))
+ seek->seek_eof = true;
+
+out:
+ fput(file);
+ return status;
+}
+
/* This routine never returns NFS_OK! If there are no other errors, it
* will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
* attributes matched. VERIFY is implemented by mapping NFSERR_SAME
@@ -1881,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+
+ /* NFSv4.2 operations */
+ [OP_SEEK] = {
+ .op_func = (nfsd4op_func)nfsd4_seek,
+ .op_name = "OP_SEEK",
+ },
};
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..ea95a2bc21b5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
void (*create)(struct nfs4_client *);
void (*remove)(struct nfs4_client *);
int (*check)(struct nfs4_client *);
- void (*grace_done)(struct nfsd_net *, time_t);
+ void (*grace_done)(struct nfsd_net *);
};
/* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
status = mnt_want_write_file(nn->rec_file);
if (status)
- return;
+ goto out_creds;
dir = nn->rec_file->f_path.dentry;
/* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
user_recovery_dirname);
}
mnt_drop_write_file(nn->rec_file);
+out_creds:
nfs4_reset_creds(original_cred);
}
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
}
static void
-nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn)
{
int status;
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
return status;
}
+static void
+nfsd4_shutdown_recdir(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (!nn->rec_file)
+ return;
+ fput(nn->rec_file);
+ nn->rec_file = NULL;
+}
static int
nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
int status;
status = nfsd4_init_recdir(net);
- if (!status)
- status = nfsd4_recdir_load(net);
if (status)
- printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+
+ status = nfsd4_recdir_load(net);
+ if (status)
+ nfsd4_shutdown_recdir(net);
+
return status;
}
@@ -546,21 +560,12 @@ err:
}
static void
-nfsd4_shutdown_recdir(struct nfsd_net *nn)
-{
- if (!nn->rec_file)
- return;
- fput(nn->rec_file);
- nn->rec_file = NULL;
-}
-
-static void
nfsd4_legacy_tracking_exit(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
nfs4_release_reclaim(nn);
- nfsd4_shutdown_recdir(nn);
+ nfsd4_shutdown_recdir(net);
nfs4_legacy_state_shutdown(net);
}
@@ -1016,7 +1021,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
}
static void
-nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn)
{
int ret;
struct cld_upcall *cup;
@@ -1029,7 +1034,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
}
cup->cu_msg.cm_cmd = Cld_GraceDone;
- cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time;
+ cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
if (!ret)
ret = cup->cu_msg.cm_status;
@@ -1062,6 +1067,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
+#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
static char *
nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1133,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
return result;
}
+static char *
+nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ /* prefix + Y/N character + terminating NUL */
+ len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
+ clp->cl_minorversion ? 'Y' : 'N');
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static char *
+nfsd4_cltrack_grace_start(time_t grace_start)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ /* prefix + max width of int64_t string + terminating NUL */
+ len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
+ grace_start);
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
static int
-nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
{
- char *envp[2];
+ char *envp[3];
char *argv[4];
int ret;
@@ -1140,10 +1197,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
dprintk("%s: cmd: %s\n", __func__, cmd);
dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
- dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+ dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
+ dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
- envp[0] = legacy;
- envp[1] = NULL;
+ envp[0] = env0;
+ envp[1] = env1;
+ envp[2] = NULL;
argv[0] = (char *)cltrack_prog;
argv[1] = cmd;
@@ -1187,28 +1246,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
}
static int
-nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+nfsd4_umh_cltrack_init(struct net *net)
{
+ int ret;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
"tracking in a container!\n");
return -EINVAL;
}
- return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+
+ ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+ kfree(grace_start);
+ return ret;
+}
+
+static void
+nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
+{
+ wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void
+nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
+{
+ smp_mb__before_atomic();
+ clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
}
static void
nfsd4_umh_cltrack_create(struct nfs4_client *clp)
{
- char *hexid;
+ char *hexid, *has_session, *grace_start;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ /*
+ * With v4.0 clients, there's little difference in outcome between a
+ * create and check operation, and we can end up calling into this
+ * function multiple times per client (once for each openowner). So,
+ * for v4.0 clients skip upcalling once the client has been recorded
+ * on stable storage.
+ *
+ * For v4.1+ clients, the outcome of the two operations is different,
+ * so we must ensure that we upcall for the create operation. v4.1+
+ * clients call this on RECLAIM_COMPLETE though, so we should only end
+ * up doing a single create upcall per client.
+ */
+ if (clp->cl_minorversion == 0 &&
+ test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
- nfsd4_umh_cltrack_upcall("create", hexid, NULL);
+
+ has_session = nfsd4_cltrack_client_has_session(clp);
+ grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
+ nfsd4_cltrack_upcall_lock(clp);
+ if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ nfsd4_cltrack_upcall_unlock(clp);
+
+ kfree(has_session);
+ kfree(grace_start);
kfree(hexid);
}
@@ -1217,12 +1326,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
{
char *hexid;
+ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
- nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
+
+ nfsd4_cltrack_upcall_lock(clp);
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
+ nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ nfsd4_cltrack_upcall_unlock(clp);
+
kfree(hexid);
}
@@ -1230,30 +1348,45 @@ static int
nfsd4_umh_cltrack_check(struct nfs4_client *clp)
{
int ret;
- char *hexid, *legacy;
+ char *hexid, *has_session, *legacy;
+
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return 0;
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return -ENOMEM;
}
+
+ has_session = nfsd4_cltrack_client_has_session(clp);
legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
- ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+
+ nfsd4_cltrack_upcall_lock(clp);
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+ ret = 0;
+ } else {
+ ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
+ if (ret == 0)
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+ nfsd4_cltrack_upcall_unlock(clp);
+ kfree(has_session);
kfree(legacy);
kfree(hexid);
+
return ret;
}
static void
-nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
- time_t boot_time)
+nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
{
char *legacy;
char timestr[22]; /* FIXME: better way to determine max size? */
- sprintf(timestr, "%ld", boot_time);
+ sprintf(timestr, "%ld", nn->boot_time);
legacy = nfsd4_cltrack_legacy_topdir();
- nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+ nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
kfree(legacy);
}
@@ -1356,10 +1489,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
}
void
-nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn)
{
if (nn->client_tracking_ops)
- nn->client_tracking_ops->grace_done(nn, boot_time);
+ nn->client_tracking_ops->grace_done(nn);
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e80a59e7e91..5c0cac173068 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -96,6 +96,8 @@ static struct kmem_cache *deleg_slab;
static void free_session(struct nfsd4_session *);
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+
static bool is_session_dead(struct nfsd4_session *ses)
{
return ses->se_flags & NFS4_SESSION_DEAD;
@@ -645,7 +647,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
dp->dl_type = NFS4_OPEN_DELEGATE_READ;
- INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall);
+ dp->dl_retries = 1;
+ nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+ &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
return dp;
out_dec:
atomic_long_dec(&num_delegations);
@@ -673,15 +677,20 @@ nfs4_put_stid(struct nfs4_stid *s)
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
- lockdep_assert_held(&state_lock);
+ struct file *filp = NULL;
+ struct file_lock *fl;
- if (!fp->fi_lease)
- return;
- if (atomic_dec_and_test(&fp->fi_delegees)) {
- vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
+ spin_lock(&fp->fi_lock);
+ if (fp->fi_lease && atomic_dec_and_test(&fp->fi_delegees)) {
+ swap(filp, fp->fi_deleg_file);
+ fl = fp->fi_lease;
fp->fi_lease = NULL;
- fput(fp->fi_deleg_file);
- fp->fi_deleg_file = NULL;
+ }
+ spin_unlock(&fp->fi_lock);
+
+ if (filp) {
+ vfs_setlease(filp, F_UNLCK, &fl);
+ fput(filp);
}
}
@@ -717,8 +726,6 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
list_del_init(&dp->dl_recall_lru);
list_del_init(&dp->dl_perfile);
spin_unlock(&fp->fi_lock);
- if (fp)
- nfs4_put_deleg_lease(fp);
}
static void destroy_delegation(struct nfs4_delegation *dp)
@@ -726,6 +733,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
spin_lock(&state_lock);
unhash_delegation_locked(dp);
spin_unlock(&state_lock);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
nfs4_put_stid(&dp->dl_stid);
}
@@ -735,6 +743,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+
if (clp->cl_minorversion == 0)
nfs4_put_stid(&dp->dl_stid);
else {
@@ -1635,6 +1645,7 @@ __destroy_client(struct nfs4_client *clp)
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
nfs4_put_stid(&dp->dl_stid);
}
while (!list_empty(&clp->cl_revoked)) {
@@ -1862,7 +1873,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
free_client(clp);
return NULL;
}
- INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null);
+ nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
copy_verf(clp, verf);
@@ -3349,8 +3360,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
return ret;
}
-void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
+static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
{
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
nfsd_net_id);
@@ -3371,6 +3383,43 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
spin_unlock(&state_lock);
}
+static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+ switch (task->tk_status) {
+ case 0:
+ return 1;
+ case -EBADHANDLE:
+ case -NFS4ERR_BAD_STATEID:
+ /*
+ * Race: client probably got cb_recall before open reply
+ * granting delegation.
+ */
+ if (dp->dl_retries--) {
+ rpc_delay(task, 2 * HZ);
+ return 0;
+ }
+ /*FALLTHRU*/
+ default:
+ return -1;
+ }
+}
+
+static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+ nfs4_put_stid(&dp->dl_stid);
+}
+
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+ .prepare = nfsd4_cb_recall_prepare,
+ .done = nfsd4_cb_recall_done,
+ .release = nfsd4_cb_recall_release,
+};
+
static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
{
/*
@@ -3381,7 +3430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* it's safe to take a reference.
*/
atomic_inc(&dp->dl_stid.sc_count);
- nfsd4_cb_recall(dp);
+ nfsd4_run_cb(&dp->dl_recall);
}
/* Called from break_lease() with i_lock held. */
@@ -3759,7 +3808,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
fl = locks_alloc_lock();
if (!fl)
return NULL;
- locks_init_lock(fl);
fl->fl_lmops = &nfsd_lease_mng_ops;
fl->fl_flags = FL_DELEG;
fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
@@ -4107,7 +4155,7 @@ out:
return status;
}
-static void
+void
nfsd4_end_grace(struct nfsd_net *nn)
{
/* do nothing if grace period already ended */
@@ -4116,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
dprintk("NFSD: end of grace period\n");
nn->grace_ended = true;
- nfsd4_record_grace_done(nn, nn->boot_time);
+ /*
+ * If the server goes down again right now, an NFSv4
+ * client will still be allowed to reclaim after it comes back up,
+ * even if it hasn't yet had a chance to reclaim state this time.
+ *
+ */
+ nfsd4_record_grace_done(nn);
+ /*
+ * At this point, NFSv4 clients can still reclaim. But if the
+ * server crashes, any that have not yet reclaimed will be out
+ * of luck on the next boot.
+ *
+ * (NFSv4.1+ clients are considered to have reclaimed once they
+ * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
+ * have reclaimed after their first OPEN.)
+ */
locks_end_grace(&nn->nfsd4_manager);
/*
- * Now that every NFSv4 client has had the chance to recover and
- * to see the (possibly new, possibly shorter) lease time, we
- * can safely set the next grace time to the current lease time:
+ * At this point, and once lockd and/or any other containers
+ * exit their grace period, further reclaims will fail and
+ * regular locking can resume.
*/
- nn->nfsd4_grace = nn->nfsd4_lease;
}
static time_t
@@ -5210,7 +5272,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
fp = lock_stp->st_stid.sc_file;
- locks_init_lock(file_lock);
switch (lock->lk_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
@@ -5354,7 +5415,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_jukebox;
goto out;
}
- locks_init_lock(file_lock);
+
switch (lockt->lt_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
@@ -5432,7 +5493,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_jukebox;
goto fput;
}
- locks_init_lock(file_lock);
+
file_lock->fl_type = F_UNLCK;
file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
file_lock->fl_pid = current->tgid;
@@ -5645,6 +5706,9 @@ nfs4_check_open_reclaim(clientid_t *clid,
if (status)
return nfserr_reclaim_bad;
+ if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+ return nfserr_no_grace;
+
if (nfsd4_client_record_check(cstate->clp))
return nfserr_reclaim_bad;
@@ -6342,10 +6406,10 @@ nfs4_state_start_net(struct net *net)
ret = nfs4_state_create_net(net);
if (ret)
return ret;
- nfsd4_client_tracking_init(net);
nn->boot_time = get_seconds();
- locks_start_grace(net, &nn->nfsd4_manager);
nn->grace_ended = false;
+ locks_start_grace(net, &nn->nfsd4_manager);
+ nfsd4_client_tracking_init(net);
printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
nn->nfsd4_grace, net);
queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
@@ -6402,6 +6466,7 @@ nfs4_state_shutdown_net(struct net *net)
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
nfs4_put_stid(&dp->dl_stid);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f9821ce6658a..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -31,13 +31,6 @@
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * TODO: Neil Brown made the following observation: We currently
- * initially reserve NFSD_BUFSIZE space on the transmit queue and
- * never release any of that until the request is complete.
- * It would be good to calculate a new maximum response size while
- * decoding the COMPOUND, and call svc_reserve with this number
- * at the end of nfs4svc_decode_compoundargs.
*/
#include <linux/slab.h>
@@ -1521,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
}
static __be32
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 4);
+ p = xdr_decode_hyper(p, &seek->seek_offset);
+ seek->seek_whence = be32_to_cpup(p);
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
{
return nfs_ok;
@@ -1593,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
+
+ /* new operations for NFSv4.2 */
+ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
+ [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
};
static inline bool
@@ -1670,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
readbytes += nfsd4_max_reply(argp->rqstp, op);
} else
max_reply += nfsd4_max_reply(argp->rqstp, op);
+ /*
+ * OP_LOCK may return a conflicting lock. (Special case
+ * because it will just skip encoding this if it runs
+ * out of xdr buffer space, and it is the only operation
+ * that behaves this way.)
+ */
+ if (op->opnum == OP_LOCK)
+ max_reply += NFS4_OPAQUE_LIMIT;
if (op->status) {
argp->opcnt = i+1;
@@ -2657,6 +2688,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
struct xdr_stream *xdr = cd->xdr;
int start_offset = xdr->buf->len;
int cookie_offset;
+ u32 name_and_cookie;
int entry_bytes;
__be32 nfserr = nfserr_toosmall;
__be64 wire_offset;
@@ -2718,7 +2750,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
cd->rd_maxcount -= entry_bytes;
if (!cd->rd_dircount)
goto fail;
- cd->rd_dircount--;
+ /*
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
+ * let's always let through the first entry, at least:
+ */
+ name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
@@ -3096,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read(
buf->page_len = maxcount;
buf->len += maxcount;
- xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE;
+ xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+ / PAGE_SIZE;
/* Use rest of head for padding and remaining ops: */
buf->tail[0].iov_base = xdr->p;
@@ -3321,6 +3361,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
}
maxcount = min_t(int, maxcount-16, bytes_left);
+ /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+ if (!readdir->rd_dircount)
+ readdir->rd_dircount = INT_MAX;
+
readdir->xdr = xdr;
readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
@@ -3751,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
}
+/*
+ * Encode the result of a SEEK operation: a 4-byte eof flag followed by the
+ * 8-byte position, both taken from @seek.
+ *
+ * Returns @nfserr unchanged when the operation has already failed,
+ * nfserr_resource when the reply stream cannot supply the 12 bytes needed,
+ * and the (zero) incoming @nfserr on success.
+ */
static __be32
+nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_seek *seek)
+{
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 8);
+	/* Do not write through a NULL pointer if no space could be reserved */
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(seek->seek_eof);
+	p = xdr_encode_hyper(p, seek->seek_pos);
+
+	return nfserr;
+}
+
+static __be32
nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
{
return nfserr;
@@ -3822,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
+
+ /* NFSv4.2 operations */
+ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
+ [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
};
/*
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index ff9567633245..122f69185ef5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -27,8 +27,12 @@
*/
#define TARGET_BUCKET_SIZE 64
-static struct hlist_head * cache_hash;
-static struct list_head lru_head;
+/*
+ * One bucket of the duplicate reply cache hash table (drc_hashtbl).
+ */
+struct nfsd_drc_bucket {
+ /* cache entries of this bucket, linked through svc_cacherep.c_lru */
+ struct list_head lru_head;
+ /*
+  * Held by nfsd_reply_cache_free() and prune_cache_entries() while
+  * they free or prune entries on lru_head.
+  */
+ spinlock_t cache_lock;
+};
+
+static struct nfsd_drc_bucket *drc_hashtbl;
static struct kmem_cache *drc_slab;
/* max number of entries allowed in the cache */
@@ -36,6 +40,7 @@ static unsigned int max_drc_entries;
/* number of significant bits in the hash value */
static unsigned int maskbits;
+static unsigned int drc_hashsize;
/*
* Stats and other tracking of on the duplicate reply cache. All of these and
@@ -43,7 +48,7 @@ static unsigned int maskbits;
*/
/* total number of entries */
-static unsigned int num_drc_entries;
+static atomic_t num_drc_entries;
/* cache misses due only to checksum comparison failures */
static unsigned int payload_misses;
@@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
* A cache entry is "single use" if c_state == RC_INPROG
* Otherwise, it when accessing _prev or _next, the lock must be held.
*/
-static DEFINE_SPINLOCK(cache_lock);
static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
/*
@@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
+/*
+ * Turn an RPC transaction id into an index into drc_hashtbl: the xid is
+ * converted to CPU byte order and then hashed down to maskbits bits.
+ */
+static u32
+nfsd_cache_hash(__be32 xid)
+{
+	u32 host_xid = be32_to_cpu(xid);
+
+	return hash_32(host_xid, maskbits);
+}
+
static struct svc_cacherep *
nfsd_reply_cache_alloc(void)
{
@@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void)
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
INIT_LIST_HEAD(&rp->c_lru);
- INIT_HLIST_NODE(&rp->c_hash);
}
return rp;
}
@@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
- if (!hlist_unhashed(&rp->c_hash))
- hlist_del(&rp->c_hash);
list_del(&rp->c_lru);
- --num_drc_entries;
+ atomic_dec(&num_drc_entries);
drc_mem_usage -= sizeof(*rp);
kmem_cache_free(drc_slab, rp);
}
static void
-nfsd_reply_cache_free(struct svc_cacherep *rp)
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
nfsd_reply_cache_free_locked(rp);
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
}
int nfsd_reply_cache_init(void)
{
unsigned int hashsize;
+ unsigned int i;
- INIT_LIST_HEAD(&lru_head);
max_drc_entries = nfsd_cache_size_limit();
- num_drc_entries = 0;
+ atomic_set(&num_drc_entries, 0);
hashsize = nfsd_hashsize(max_drc_entries);
maskbits = ilog2(hashsize);
@@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void)
if (!drc_slab)
goto out_nomem;
- cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
- if (!cache_hash)
+ drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
+ if (!drc_hashtbl)
goto out_nomem;
+ for (i = 0; i < hashsize; i++) {
+ INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
+ spin_lock_init(&drc_hashtbl[i].cache_lock);
+ }
+ drc_hashsize = hashsize;
return 0;
out_nomem:
@@ -184,17 +196,22 @@ out_nomem:
void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
+ unsigned int i;
unregister_shrinker(&nfsd_reply_cache_shrinker);
cancel_delayed_work_sync(&cache_cleaner);
- while (!list_empty(&lru_head)) {
- rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
- nfsd_reply_cache_free_locked(rp);
+ for (i = 0; i < drc_hashsize; i++) {
+ struct list_head *head = &drc_hashtbl[i].lru_head;
+ while (!list_empty(head)) {
+ rp = list_first_entry(head, struct svc_cacherep, c_lru);
+ nfsd_reply_cache_free_locked(rp);
+ }
}
- kfree (cache_hash);
- cache_hash = NULL;
+ kfree (drc_hashtbl);
+ drc_hashtbl = NULL;
+ drc_hashsize = 0;
if (drc_slab) {
kmem_cache_destroy(drc_slab);
@@ -207,61 +224,63 @@ void nfsd_reply_cache_shutdown(void)
* not already scheduled.
*/
static void
-lru_put_end(struct svc_cacherep *rp)
+lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
- list_move_tail(&rp->c_lru, &lru_head);
+ list_move_tail(&rp->c_lru, &b->lru_head);
schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
-/*
- * Move a cache entry from one hash list to another
- */
-static void
-hash_refile(struct svc_cacherep *rp)
-{
- hlist_del_init(&rp->c_hash);
- /*
- * No point in byte swapping c_xid since we're just using it to pick
- * a hash bucket.
- */
- hlist_add_head(&rp->c_hash, cache_hash +
- hash_32((__force u32)rp->c_xid, maskbits));
-}
-
-/*
- * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
- * Also prune the oldest ones when the total exceeds the max number of entries.
- */
static long
-prune_cache_entries(void)
+prune_bucket(struct nfsd_drc_bucket *b)
{
struct svc_cacherep *rp, *tmp;
long freed = 0;
- list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
/*
* Don't free entries attached to calls that are still
* in-progress, but do keep scanning the list.
*/
if (rp->c_state == RC_INPROG)
continue;
- if (num_drc_entries <= max_drc_entries &&
+ if (atomic_read(&num_drc_entries) <= max_drc_entries &&
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(rp);
freed++;
}
+ return freed;
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static long
+prune_cache_entries(void)
+{
+ unsigned int i;
+ long freed = 0;
+ bool cancel = true;
+
+ for (i = 0; i < drc_hashsize; i++) {
+ struct nfsd_drc_bucket *b = &drc_hashtbl[i];
+
+ if (list_empty(&b->lru_head))
+ continue;
+ spin_lock(&b->cache_lock);
+ freed += prune_bucket(b);
+ if (!list_empty(&b->lru_head))
+ cancel = false;
+ spin_unlock(&b->cache_lock);
+ }
/*
- * Conditionally rearm the job. If we cleaned out the list, then
- * cancel any pending run (since there won't be any work to do).
- * Otherwise, we rearm the job or modify the existing one to run in
- * RC_EXPIRE since we just ran the pruner.
+ * Conditionally rearm the job to run in RC_EXPIRE since we just
+ * ran the pruner.
*/
- if (list_empty(&lru_head))
- cancel_delayed_work(&cache_cleaner);
- else
+ if (!cancel)
mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
@@ -269,32 +288,19 @@ prune_cache_entries(void)
static void
cache_cleaner_func(struct work_struct *unused)
{
- spin_lock(&cache_lock);
prune_cache_entries();
- spin_unlock(&cache_lock);
}
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long num;
-
- spin_lock(&cache_lock);
- num = num_drc_entries;
- spin_unlock(&cache_lock);
-
- return num;
+ return atomic_read(&num_drc_entries);
}
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long freed;
-
- spin_lock(&cache_lock);
- freed = prune_cache_entries();
- spin_unlock(&cache_lock);
- return freed;
+ return prune_cache_entries();
}
/*
* Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -332,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
static bool
nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
{
- /* Check RPC header info first */
- if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
- rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
- rqstp->rq_arg.len != rp->c_len ||
- !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
- rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ /* Check RPC XID first */
+ if (rqstp->rq_xid != rp->c_xid)
return false;
-
/* compare checksum of NFS data */
if (csum != rp->c_csum) {
++payload_misses;
return false;
}
+ /* Other discriminators */
+ if (rqstp->rq_proc != rp->c_proc ||
+ rqstp->rq_prot != rp->c_prot ||
+ rqstp->rq_vers != rp->c_vers ||
+ rqstp->rq_arg.len != rp->c_len ||
+ !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+ rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ return false;
+
return true;
}
@@ -355,18 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
* NULL on failure.
*/
static struct svc_cacherep *
-nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
+ __wsum csum)
{
struct svc_cacherep *rp, *ret = NULL;
- struct hlist_head *rh;
+ struct list_head *rh = &b->lru_head;
unsigned int entries = 0;
- /*
- * No point in byte swapping rq_xid since we're just using it to pick
- * a hash bucket.
- */
- rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)];
- hlist_for_each_entry(rp, rh, c_hash) {
+ list_for_each_entry(rp, rh, c_lru) {
++entries;
if (nfsd_cache_match(rqstp, csum, rp)) {
ret = rp;
@@ -377,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
/* tally hash chain length stats */
if (entries > longest_chain) {
longest_chain = entries;
- longest_chain_cachesize = num_drc_entries;
+ longest_chain_cachesize = atomic_read(&num_drc_entries);
} else if (entries == longest_chain) {
/* prefer to keep the smallest cachesize possible here */
- longest_chain_cachesize = min(longest_chain_cachesize,
- num_drc_entries);
+ longest_chain_cachesize = min_t(unsigned int,
+ longest_chain_cachesize,
+ atomic_read(&num_drc_entries));
}
return ret;
@@ -403,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
__wsum csum;
+ u32 hash = nfsd_cache_hash(xid);
+ struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
unsigned long age;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -420,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
rp = nfsd_reply_cache_alloc();
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
if (likely(rp)) {
- ++num_drc_entries;
+ atomic_inc(&num_drc_entries);
drc_mem_usage += sizeof(*rp);
}
/* go ahead and prune the cache */
- prune_cache_entries();
+ prune_bucket(b);
- found = nfsd_cache_search(rqstp, csum);
+ found = nfsd_cache_search(b, rqstp, csum);
if (found) {
if (likely(rp))
nfsd_reply_cache_free_locked(rp);
@@ -454,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp->c_len = rqstp->rq_arg.len;
rp->c_csum = csum;
- hash_refile(rp);
- lru_put_end(rp);
+ lru_put_end(b, rp);
/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
@@ -465,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
}
rp->c_type = RC_NOCACHE;
out:
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
return rtn;
found_entry:
nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
age = jiffies - rp->c_timestamp;
- lru_put_end(rp);
+ lru_put_end(b, rp);
rtn = RC_DROPIT;
/* Request being processed or excessive rexmits */
@@ -527,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
+ u32 hash;
+ struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;
if (!rp)
return;
+ hash = nfsd_cache_hash(rp->c_xid);
+ b = &drc_hashtbl[hash];
+
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
/* Don't cache excessive amounts of data and XDR failures */
if (!statp || len > (256 >> 2)) {
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
@@ -553,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
bufsize = len << 2;
cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
if (!cachv->iov_base) {
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
cachv->iov_len = bufsize;
memcpy(cachv->iov_base, statp, bufsize);
break;
case RC_NOCACHE:
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
drc_mem_usage += bufsize;
- lru_put_end(rp);
+ lru_put_end(b, rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
return;
}
@@ -600,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
*/
static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
- spin_lock(&cache_lock);
seq_printf(m, "max entries: %u\n", max_drc_entries);
- seq_printf(m, "num entries: %u\n", num_drc_entries);
+ seq_printf(m, "num entries: %u\n",
+ atomic_read(&num_drc_entries));
seq_printf(m, "hash buckets: %u\n", 1 << maskbits);
seq_printf(m, "mem usage: %u\n", drc_mem_usage);
seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
@@ -611,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "payload misses: %u\n", payload_misses);
seq_printf(m, "longest chain len: %u\n", longest_chain);
seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
- spin_unlock(&cache_lock);
return 0;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4e042105fb6e..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -49,6 +49,7 @@ enum {
NFSD_Leasetime,
NFSD_Gracetime,
NFSD_RecoveryDir,
+ NFSD_V4EndGrace,
#endif
};
@@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
#endif
static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Leasetime] = write_leasetime,
[NFSD_Gracetime] = write_gracetime,
[NFSD_RecoveryDir] = write_recoverydir,
+ [NFSD_V4EndGrace] = write_v4_end_grace,
#endif
};
@@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
return rv;
}
+/**
+ * write_v4_end_grace - report, and optionally end, nfsd's v4 grace period
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * OR
+ *
+ * Input:
+ *			buf:		C string; only its first character
+ *					is examined
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	@buf is overwritten with "Y\n" or "N\n" as a
+ *			NUL-terminated C string, reflecting the
+ *			grace_ended flag of the nfsd_net this file
+ *			belongs to; the return code is the size in
+ *			bytes of that string. A non-empty write
+ *			starting with 'Y', 'y', or '1' first ends the
+ *			grace period for nfsd's v4 lock manager.
+ *	On error:	-EINVAL when a non-empty write starts with any
+ *			other character.
+ */
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
+{
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	char request;
+
+	if (size > 0) {
+		request = buf[0];
+		/* anything but an affirmative first character is rejected */
+		if (request != 'Y' && request != 'y' && request != '1')
+			return -EINVAL;
+		nfsd4_end_grace(nn);
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
+			 nn->grace_ended ? 'Y' : 'N');
+}
+
#endif
/*----------------------------------------------------------------------------*/
@@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
#endif
/* last one */ {""}
};
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847daf37e566..747f3b95bd11 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -251,7 +251,7 @@ void nfsd_lockd_shutdown(void);
#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
-#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e883a5868be6..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -209,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
* fix that case easily.
*/
struct cred *new = prepare_creds();
- if (!new)
- return nfserrno(-ENOMEM);
+ if (!new) {
+ error = nfserrno(-ENOMEM);
+ goto out;
+ }
new->cap_effective =
cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4a89e00d7461..0a47c6a6b301 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,16 +62,21 @@ typedef struct {
(s)->si_generation
struct nfsd4_callback {
- void *cb_op;
struct nfs4_client *cb_clp;
struct list_head cb_per_client;
u32 cb_minorversion;
struct rpc_message cb_msg;
- const struct rpc_call_ops *cb_ops;
+ struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
bool cb_done;
};
+/*
+ * Hooks attached to a struct nfsd4_callback through its cb_ops pointer.
+ * NOTE(review): when each hook runs is decided by the callback code, which
+ * is not part of this header -- confirm there before relying on ordering.
+ */
+struct nfsd4_callback_ops {
+ void (*prepare)(struct nfsd4_callback *);
+ /* the only hook that also receives the rpc_task and returns a value */
+ int (*done)(struct nfsd4_callback *, struct rpc_task *);
+ void (*release)(struct nfsd4_callback *);
+};
+
/*
* A core object that represents a "common" stateid. These are generally
* embedded within the different (more specific) stateid objects and contain
@@ -127,6 +132,9 @@ struct nfs4_delegation {
struct nfsd4_callback dl_recall;
};
+/* Recover the nfs4_delegation that embeds @cb as its dl_recall member */
+#define cb_to_delegation(cb) \
+ container_of(cb, struct nfs4_delegation, dl_recall)
+
/* client delegation callback info */
struct nfs4_cb_conn {
/* SETCLIENTID info */
@@ -306,6 +314,7 @@ struct nfs4_client {
#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
unsigned long cl_flags;
@@ -517,6 +526,13 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
#define RD_STATE 0x00000010
#define WR_STATE 0x00000020
+/* Callback operation selector; the type of nfsd4_init_cb()'s @op argument */
+enum nfsd4_cb_op {
+ NFSPROC4_CLNT_CB_NULL = 0,
+ NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
struct nfsd4_compound_state;
struct nfsd_net;
@@ -531,12 +547,12 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
extern int set_callback_cred(void);
-void nfsd4_run_cb_null(struct work_struct *w);
-void nfsd4_run_cb_recall(struct work_struct *w);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
@@ -545,13 +561,16 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
/* nfs4recover operations */
extern int nfsd4_client_tracking_init(struct net *net);
extern void nfsd4_client_tracking_exit(struct net *net);
extern void nfsd4_client_record_create(struct nfs4_client *clp);
extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
/* nfs fault injection functions */
#ifdef CONFIG_NFSD_FAULT_INJECTION
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f501a9b5c9df..965cffd17a0c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -445,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
if (err)
goto out;
size_change = 1;
+
+ /*
+ * RFC5661, Section 18.30.4:
+ * Changing the size of a file with SETATTR indirectly
+ * changes the time_modify and change attributes.
+ *
+ * (and similar for the older RFCs)
+ */
+ if (iap->ia_size != i_size_read(inode))
+ iap->ia_valid |= ATTR_MTIME;
}
iap->ia_valid |= ATTR_CTIME;
@@ -649,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
{
struct path path;
struct inode *inode;
+ struct file *file;
int flags = O_RDONLY|O_LARGEFILE;
__be32 err;
int host_err = 0;
@@ -703,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
else
flags = O_WRONLY|O_LARGEFILE;
}
- *filp = dentry_open(&path, flags, current_cred());
- if (IS_ERR(*filp)) {
- host_err = PTR_ERR(*filp);
- *filp = NULL;
- } else {
- host_err = ima_file_check(*filp, may_flags);
- if (may_flags & NFSD_MAY_64BIT_COOKIE)
- (*filp)->f_mode |= FMODE_64BITHASH;
- else
- (*filp)->f_mode |= FMODE_32BITHASH;
+ file = dentry_open(&path, flags, current_cred());
+ if (IS_ERR(file)) {
+ host_err = PTR_ERR(file);
+ goto out_nfserr;
}
+ host_err = ima_file_check(file, may_flags);
+ if (host_err) {
+ nfsd_close(file);
+ goto out_nfserr;
+ }
+
+ if (may_flags & NFSD_MAY_64BIT_COOKIE)
+ file->f_mode |= FMODE_64BITHASH;
+ else
+ file->f_mode |= FMODE_32BITHASH;
+
+ *filp = file;
out_nfserr:
err = nfserrno(host_err);
out:
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 465e7799742a..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,17 @@ struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
+/* Arguments and results of the NFSv4.2 SEEK operation (nfsd4_op.u.seek) */
+struct nfsd4_seek {
+ /* request */
+ stateid_t seek_stateid;
+ loff_t seek_offset;
+ u32 seek_whence;
+
+ /* response */
+ /* written to the reply as one 32-bit word by nfsd4_encode_seek() */
+ u32 seek_eof;
+ /* written to the reply as a 64-bit hyper, right after seek_eof */
+ loff_t seek_pos;
+};
+
struct nfsd4_op {
int opnum;
__be32 status;
@@ -473,6 +484,9 @@ struct nfsd4_op {
struct nfsd4_reclaim_complete reclaim_complete;
struct nfsd4_test_stateid test_stateid;
struct nfsd4_free_stateid free_stateid;
+
+ /* NFSv4.2 */
+ struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
};
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b173a465..d071e7f23de2 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
+#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/aio.h>
#include "nilfs.h"
@@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
static int nilfs_set_page_dirty(struct page *page)
{
+ struct inode *inode = page->mapping->host;
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- struct inode *inode = page->mapping->host;
unsigned nr_dirty = 0;
struct buffer_head *bh, *head;
@@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
+ } else if (ret) {
+ unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
struct {
struct file_handle handle;
- u8 pad[64];
+ u8 pad[MAX_HANDLE_SZ];
} f;
int size, ret, i;
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
size = f.handle.handle_bytes >> 2;
ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
- if ((ret == 255) || (ret == -ENOSPC)) {
+ if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
return 0;
}
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..62e8ec619b4c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work)
}
out:
- spin_unlock(&qs->qs_lock);
- if (fence)
+ if (fence) {
+ spin_unlock(&qs->qs_lock);
o2quo_fence_self();
+ } else {
+ mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+ "connected: %d, lowest: %d (%sreachable)\n",
+ qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+ lowest_reachable ? "" : "un");
+ spin_unlock(&qs->qs_lock);
+
+ }
+
}
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 681691bc233a..ea34952f9496 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
return ret;
}
+/*
+ * Set the TCP_USER_TIMEOUT option of @sock to O2NET_TCP_USER_TIMEOUT.
+ * The result of kernel_setsockopt() is handed back unchanged.
+ */
+static int o2net_set_usertimeout(struct socket *sock)
+{
+	int timeout = O2NET_TCP_USER_TIMEOUT;
+	int ret;
+
+	ret = kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+				(char *)&timeout, sizeof(timeout));
+	return ret;
+}
+
static void o2net_initialize_handshake(void)
{
o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data)
#endif
printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
- "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
- msecs / 1000, msecs % 1000);
+ "idle for %lu.%lu secs.\n",
+ SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
- /*
- * Initialize the nn_timeout so that the next connection attempt
- * will continue in o2net_start_connect.
+ /* The idle timeout fired: don't shut down the connection, but
+ * start the fence decision. The connection may still recover
+ * before the decision is made.
*/
atomic_set(&nn->nn_timeout, 1);
+ o2quo_conn_err(o2net_num_from_nn(nn));
+ queue_delayed_work(o2net_wq, &nn->nn_still_up,
+ msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+ o2net_sc_reset_idle_timer(sc);
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
}
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
{
+ struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+ /* clear the fence decision since the connection recovered from the timeout */
+ if (atomic_read(&nn->nn_timeout)) {
+ o2quo_conn_up(o2net_num_from_nn(nn));
+ cancel_delayed_work(&nn->nn_still_up);
+ atomic_set(&nn->nn_timeout, 0);
+ }
+
/* Only push out an existing timer */
if (timer_pending(&sc->sc_idle_timeout))
o2net_sc_reset_idle_timer(sc);
@@ -1650,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work)
goto out;
}
+ ret = o2net_set_usertimeout(sock);
+ if (ret) {
+ mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+ goto out;
+ }
+
o2net_register_callbacks(sc->sc_sock->sk, sc);
spin_lock(&nn->nn_lock);
@@ -1831,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
goto out;
}
+ ret = o2net_set_usertimeout(new_sock);
+ if (ret) {
+ mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+ goto out;
+ }
+
slen = sizeof(sin);
ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
&slen, 1);
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..c571e849fda4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
+#define O2NET_TCP_USER_TIMEOUT 0x7fffffff
/* TODO: figure this out.... */
static inline int o2net_link_down(int err, struct socket *sock)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3ec906ef5d9a..12ba682fc53c 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -655,12 +655,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
clear_bit(bit, res->refmap);
}
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- assert_spin_locked(&res->spinlock);
-
res->inflight_locks++;
mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -668,6 +665,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
__builtin_return_address(0));
}
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ __dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -894,10 +898,8 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* Grab inflight ref to pin the resource */
- spin_lock(&res->spinlock);
- dlm_lockres_grab_inflight_ref(dlm, res);
- spin_unlock(&res->spinlock);
+ /* since this lockres is new it does not require the spinlock */
+ __dlm_lockres_grab_inflight_ref(dlm, res);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -2037,6 +2039,10 @@ kill:
"and killing the other node now! This node is OK and can continue.\n");
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
+ spin_lock(&dlm->master_lock);
+ if (mle)
+ __dlm_put_mle(mle);
+ spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
*ret_data = (void *)res;
dlm_put(dlm);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 6f66b3751ace..53e6c40ed4c6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -35,9 +35,8 @@
copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
/*
- * This call is void because we are already reporting an error that may
- * be -EFAULT. The error will be returned from the ioctl(2) call. It's
- * just a best-effort to tell userspace that this request caused the error.
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
*/
static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
struct ocfs2_info_request __user *req)
@@ -146,136 +145,105 @@ bail:
static int ocfs2_info_handle_blocksize(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_blocksize oib;
if (o2info_from_user(oib, req))
- goto bail;
+ return -EFAULT;
oib.ib_blocksize = inode->i_sb->s_blocksize;
o2info_set_request_filled(&oib.ib_req);
if (o2info_to_user(oib, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oib.ib_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
static int ocfs2_info_handle_clustersize(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_clustersize oic;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oic, req))
- goto bail;
+ return -EFAULT;
oic.ic_clustersize = osb->s_clustersize;
o2info_set_request_filled(&oic.ic_req);
if (o2info_to_user(oic, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oic.ic_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
static int ocfs2_info_handle_maxslots(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_maxslots oim;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oim, req))
- goto bail;
+ return -EFAULT;
oim.im_max_slots = osb->max_slots;
o2info_set_request_filled(&oim.im_req);
if (o2info_to_user(oim, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oim.im_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_handle_label(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_label oil;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oil, req))
- goto bail;
+ return -EFAULT;
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
o2info_set_request_filled(&oil.il_req);
if (o2info_to_user(oil, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oil.il_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_handle_uuid(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_uuid oiu;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oiu, req))
- goto bail;
+ return -EFAULT;
memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
o2info_set_request_filled(&oiu.iu_req);
if (o2info_to_user(oiu, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oiu.iu_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
static int ocfs2_info_handle_fs_features(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_fs_features oif;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oif, req))
- goto bail;
+ return -EFAULT;
oif.if_compat_features = osb->s_feature_compat;
oif.if_incompat_features = osb->s_feature_incompat;
@@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode,
o2info_set_request_filled(&oif.if_req);
if (o2info_to_user(oif, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oif.if_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_handle_journal_size(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_journal_size oij;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oij, req))
- goto bail;
+ return -EFAULT;
oij.ij_journal_size = i_size_read(osb->journal->j_inode);
o2info_set_request_filled(&oij.ij_req);
if (o2info_to_user(oij, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oij.ij_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
@@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
u32 i;
u64 blkno = -1;
char namebuf[40];
- int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+ int status, type = INODE_ALLOC_SYSTEM_INODE;
struct ocfs2_info_freeinode *oifi = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *inode_alloc = NULL;
@@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
goto out_err;
}
- if (o2info_from_user(*oifi, req))
- goto bail;
+ if (o2info_from_user(*oifi, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
oifi->ifi_slotnum = osb->max_slots;
@@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
o2info_set_request_filled(&oifi->ifi_req);
- if (o2info_to_user(*oifi, req))
- goto bail;
+ if (o2info_to_user(*oifi, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
status = 0;
bail:
if (status)
o2info_set_request_error(&oifi->ifi_req, req);
-
+out_free:
kfree(oifi);
out_err:
return status;
@@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
{
u64 blkno = -1;
char namebuf[40];
- int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+ int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
struct ocfs2_info_freefrag *oiff;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
goto out_err;
}
- if (o2info_from_user(*oiff, req))
- goto bail;
+ if (o2info_from_user(*oiff, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
/*
* chunksize from userspace should be power of 2.
*/
@@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
if (o2info_to_user(*oiff, req)) {
status = -EFAULT;
- goto bail;
+ goto out_free;
}
status = 0;
bail:
if (status)
o2info_set_request_error(&oiff->iff_req, req);
-
+out_free:
kfree(oiff);
out_err:
return status;
@@ -727,23 +690,17 @@ out_err:
static int ocfs2_info_handle_unknown(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_request oir;
if (o2info_from_user(oir, req))
- goto bail;
+ return -EFAULT;
o2info_clear_request_filled(&oir);
if (o2info_to_user(oir, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oir, req);
-
- return status;
+ return 0;
}
/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ddb662b32447..4142546aedae 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2532,6 +2532,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
+ kfree(osb->vol_label);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22c4a30..aae331a5d03b 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt)
* other children
*/
if (child && list_empty(&child->mnt_mounts)) {
+ list_del_init(&child->mnt_child);
hlist_del_init_rcu(&child->mnt_hash);
hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfc791c42d64..c34156888d70 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -931,23 +931,32 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
- unsigned long vm_end;
+ /* End of address space hole, which we mark as non-present. */
+ unsigned long hole_end;
- if (!vma) {
- vm_end = end;
- } else {
- vm_end = min(end, vma->vm_end);
- if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ if (vma)
+ hole_end = min(end, vma->vm_start);
+ else
+ hole_end = end;
+
+ for (; addr < hole_end; addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ goto out;
}
- for (; addr < vm_end; addr += PAGE_SIZE) {
+ if (!vma)
+ break;
+
+ /* Addresses in the VMA. */
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
goto out;
}
}
-
out:
return err;
}
diff --git a/fs/stack.c b/fs/stack.c
index 5b5388250e29..a54e33ed10f1 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
* include/linux/fs.h). We don't necessarily hold i_mutex when this
* is called, so take i_lock for that case.
*
- * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the
+ * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
* two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
* for that case too, and do both at once by combining the tests.
*
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd10e8b..bdc729d80e5e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb)
return ret;
return __sync_filesystem(sb, 1);
}
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6eaf5edf1ea1..e77db621ec89 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode)
udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
}
-struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
@@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
struct udf_inode_info *iinfo;
struct udf_inode_info *dinfo = UDF_I(dir);
struct logicalVolIntegrityDescImpUse *lvidiu;
+ int err;
inode = new_inode(sb);
- if (!inode) {
- *err = -ENOMEM;
- return NULL;
- }
- *err = -ENOSPC;
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
iinfo = UDF_I(inode);
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
@@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
}
if (!iinfo->i_ext.i_data) {
iput(inode);
- *err = -ENOMEM;
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
+ err = -ENOSPC;
block = udf_new_block(dir->i_sb, NULL,
dinfo->i_location.partitionReferenceNum,
- start, err);
- if (*err) {
+ start, &err);
+ if (err) {
iput(inode);
- return NULL;
+ return ERR_PTR(err);
}
lvidiu = udf_sb_lvidiu(sb);
if (lvidiu) {
iinfo->i_unique = lvid_get_unique_id(sb);
+ inode->i_generation = iinfo->i_unique;
mutex_lock(&sbi->s_alloc_mutex);
if (S_ISDIR(mode))
le32_add_cpu(&lvidiu->numDirs, 1);
@@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
inode->i_mtime = inode->i_atime = inode->i_ctime =
iinfo->i_crtime = current_fs_time(inode->i_sb);
- insert_inode_hash(inode);
+ if (unlikely(insert_inode_locked(inode) < 0)) {
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
mark_inode_dirty(inode);
- *err = 0;
return inode;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 236cd48184c2..08598843288f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -51,7 +51,6 @@ MODULE_LICENSE("GPL");
static umode_t udf_convert_permissions(struct fileEntry *);
static int udf_update_inode(struct inode *, int);
-static void udf_fill_inode(struct inode *, struct buffer_head *);
static int udf_sync_inode(struct inode *inode);
static int udf_alloc_i_data(struct inode *inode, size_t size);
static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
@@ -1271,12 +1270,33 @@ update_time:
return 0;
}
-static void __udf_read_inode(struct inode *inode)
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
+static int udf_read_inode(struct inode *inode)
{
struct buffer_head *bh = NULL;
struct fileEntry *fe;
+ struct extendedFileEntry *efe;
uint16_t ident;
struct udf_inode_info *iinfo = UDF_I(inode);
+ struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+ struct kernel_lb_addr *iloc = &iinfo->i_location;
+ unsigned int link_count;
+ unsigned int indirections = 0;
+ int ret = -EIO;
+
+reread:
+ if (iloc->logicalBlockNum >=
+ sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+ udf_debug("block=%d, partition=%d out of range\n",
+ iloc->logicalBlockNum, iloc->partitionReferenceNum);
+ return -EIO;
+ }
/*
* Set defaults, but the inode is still incomplete!
@@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode)
* i_nlink = 1
* i_op = NULL;
*/
- bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
+ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
if (!bh) {
udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
- make_bad_inode(inode);
- return;
+ return -EIO;
}
if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
ident != TAG_IDENT_USE) {
udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
inode->i_ino, ident);
- brelse(bh);
- make_bad_inode(inode);
- return;
+ goto out;
}
fe = (struct fileEntry *)bh->b_data;
+ efe = (struct extendedFileEntry *)bh->b_data;
if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
struct buffer_head *ibh;
- ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
- &ident);
+ ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
if (ident == TAG_IDENT_IE && ibh) {
- struct buffer_head *nbh = NULL;
struct kernel_lb_addr loc;
struct indirectEntry *ie;
ie = (struct indirectEntry *)ibh->b_data;
loc = lelb_to_cpu(ie->indirectICB.extLocation);
- if (ie->indirectICB.extLength &&
- (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
- &ident))) {
- if (ident == TAG_IDENT_FE ||
- ident == TAG_IDENT_EFE) {
- memcpy(&iinfo->i_location,
- &loc,
- sizeof(struct kernel_lb_addr));
- brelse(bh);
- brelse(ibh);
- brelse(nbh);
- __udf_read_inode(inode);
- return;
+ if (ie->indirectICB.extLength) {
+ brelse(ibh);
+ memcpy(&iinfo->i_location, &loc,
+ sizeof(struct kernel_lb_addr));
+ if (++indirections > UDF_MAX_ICB_NESTING) {
+ udf_err(inode->i_sb,
+ "too many ICBs in ICB hierarchy"
+ " (max %d supported)\n",
+ UDF_MAX_ICB_NESTING);
+ goto out;
}
- brelse(nbh);
+ brelse(bh);
+ goto reread;
}
}
brelse(ibh);
} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
udf_err(inode->i_sb, "unsupported strategy type: %d\n",
le16_to_cpu(fe->icbTag.strategyType));
- brelse(bh);
- make_bad_inode(inode);
- return;
+ goto out;
}
- udf_fill_inode(inode, bh);
-
- brelse(bh);
-}
-
-static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
-{
- struct fileEntry *fe;
- struct extendedFileEntry *efe;
- struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
- struct udf_inode_info *iinfo = UDF_I(inode);
- unsigned int link_count;
-
- fe = (struct fileEntry *)bh->b_data;
- efe = (struct extendedFileEntry *)bh->b_data;
-
if (fe->icbTag.strategyType == cpu_to_le16(4))
iinfo->i_strat4096 = 0;
else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
@@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
iinfo->i_efe = 1;
iinfo->i_use = 0;
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct extendedFileEntry))) {
- make_bad_inode(inode);
- return;
- }
+ ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ sizeof(struct extendedFileEntry));
+ if (ret)
+ goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct extendedFileEntry),
inode->i_sb->s_blocksize -
@@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
iinfo->i_efe = 0;
iinfo->i_use = 0;
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct fileEntry))) {
- make_bad_inode(inode);
- return;
- }
+ ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ sizeof(struct fileEntry));
+ if (ret)
+ goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct fileEntry),
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
@@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_lenAlloc = le32_to_cpu(
((struct unallocSpaceEntry *)bh->b_data)->
lengthAllocDescs);
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct unallocSpaceEntry))) {
- make_bad_inode(inode);
- return;
- }
+ ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ sizeof(struct unallocSpaceEntry));
+ if (ret)
+ goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct unallocSpaceEntry),
inode->i_sb->s_blocksize -
sizeof(struct unallocSpaceEntry));
- return;
+ return 0;
}
+ ret = -EIO;
read_lock(&sbi->s_cred_lock);
i_uid_write(inode, le32_to_cpu(fe->uid));
if (!uid_valid(inode->i_uid) ||
@@ -1441,8 +1435,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
read_unlock(&sbi->s_cred_lock);
link_count = le16_to_cpu(fe->fileLinkCount);
- if (!link_count)
- link_count = 1;
+ if (!link_count) {
+ ret = -ESTALE;
+ goto out;
+ }
set_nlink(inode, link_count);
inode->i_size = le64_to_cpu(fe->informationLength);
@@ -1488,6 +1484,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
}
+ inode->i_generation = iinfo->i_unique;
switch (fe->icbTag.fileType) {
case ICBTAG_FILE_TYPE_DIRECTORY:
@@ -1537,8 +1534,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
default:
udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
inode->i_ino, fe->icbTag.fileType);
- make_bad_inode(inode);
- return;
+ goto out;
}
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
struct deviceSpec *dsea =
@@ -1549,8 +1545,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
le32_to_cpu(dsea->minorDeviceIdent)));
/* Developer ID ??? */
} else
- make_bad_inode(inode);
+ goto out;
}
+ ret = 0;
+out:
+ brelse(bh);
+ return ret;
}
static int udf_alloc_i_data(struct inode *inode, size_t size)
@@ -1664,7 +1664,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
fe->permissions = cpu_to_le32(udfperms);
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
else
fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
@@ -1830,32 +1830,23 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
{
unsigned long block = udf_get_lb_pblock(sb, ino, 0);
struct inode *inode = iget_locked(sb, block);
+ int err;
if (!inode)
- return NULL;
-
- if (inode->i_state & I_NEW) {
- memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
- __udf_read_inode(inode);
- unlock_new_inode(inode);
- }
+ return ERR_PTR(-ENOMEM);
- if (is_bad_inode(inode))
- goto out_iput;
+ if (!(inode->i_state & I_NEW))
+ return inode;
- if (ino->logicalBlockNum >= UDF_SB(sb)->
- s_partmaps[ino->partitionReferenceNum].s_partition_len) {
- udf_debug("block=%d, partition=%d out of range\n",
- ino->logicalBlockNum, ino->partitionReferenceNum);
- make_bad_inode(inode);
- goto out_iput;
+ memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+ err = udf_read_inode(inode);
+ if (err < 0) {
+ iget_failed(inode);
+ return ERR_PTR(err);
}
+ unlock_new_inode(inode);
return inode;
-
- out_iput:
- iput(inode);
- return NULL;
}
int udf_add_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 83a06001742b..c12e260fd6c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
NULL, 0),
};
inode = udf_iget(dir->i_sb, lb);
- if (!inode) {
- return ERR_PTR(-EACCES);
- }
+ if (IS_ERR(inode))
+ return inode;
} else
#endif /* UDF_RECOVERY */
@@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
loc = lelb_to_cpu(cfi.icb.extLocation);
inode = udf_iget(dir->i_sb, &loc);
- if (!inode) {
- return ERR_PTR(-EACCES);
- }
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
}
return d_splice_alias(inode, dentry);
@@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
}
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct inode *dir = dentry->d_parent->d_inode;
struct udf_fileident_bh fibh;
- struct inode *inode;
struct fileIdentDesc cfi, *fi;
int err;
- struct udf_inode_info *iinfo;
-
- inode = udf_new_inode(dir, mode, &err);
- if (!inode) {
- return err;
- }
-
- iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- inode->i_data.a_ops = &udf_adinicb_aops;
- else
- inode->i_data.a_ops = &udf_aops;
- inode->i_op = &udf_file_inode_operations;
- inode->i_fop = &udf_file_operations;
- mark_inode_dirty(inode);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
- if (!fi) {
+ if (unlikely(!fi)) {
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
return 0;
}
-static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
- struct inode *inode;
- struct udf_inode_info *iinfo;
- int err;
+ struct inode *inode = udf_new_inode(dir, mode);
- inode = udf_new_inode(dir, mode, &err);
- if (!inode)
- return err;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
inode->i_data.a_ops = &udf_adinicb_aops;
else
inode->i_data.a_ops = &udf_aops;
@@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_fop = &udf_file_operations;
mark_inode_dirty(inode);
+ return udf_add_nondir(dentry, inode);
+}
+
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode = udf_new_inode(dir, mode);
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ inode->i_data.a_ops = &udf_adinicb_aops;
+ else
+ inode->i_data.a_ops = &udf_aops;
+ inode->i_op = &udf_file_inode_operations;
+ inode->i_fop = &udf_file_operations;
+ mark_inode_dirty(inode);
d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
@@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct inode *inode;
- struct udf_fileident_bh fibh;
- struct fileIdentDesc cfi, *fi;
- int err;
- struct udf_inode_info *iinfo;
if (!old_valid_dev(rdev))
return -EINVAL;
- err = -EIO;
- inode = udf_new_inode(dir, mode, &err);
- if (!inode)
- goto out;
+ inode = udf_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- iinfo = UDF_I(inode);
init_special_inode(inode, mode, rdev);
- fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
- if (!fi) {
- inode_dec_link_count(inode);
- iput(inode);
- return err;
- }
- cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
- cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
- *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
- cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
- udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
- if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- mark_inode_dirty(dir);
- mark_inode_dirty(inode);
-
- if (fibh.sbh != fibh.ebh)
- brelse(fibh.ebh);
- brelse(fibh.sbh);
- d_instantiate(dentry, inode);
- err = 0;
-
-out:
- return err;
+ return udf_add_nondir(dentry, inode);
}
static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct udf_inode_info *dinfo = UDF_I(dir);
struct udf_inode_info *iinfo;
- err = -EIO;
- inode = udf_new_inode(dir, S_IFDIR | mode, &err);
- if (!inode)
- goto out;
+ inode = udf_new_inode(dir, S_IFDIR | mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
iinfo = UDF_I(inode);
inode->i_op = &udf_dir_inode_operations;
@@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
if (!fi) {
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (!fi) {
clear_nlink(inode);
mark_inode_dirty(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
inc_nlink(dir);
mark_inode_dirty(dir);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
@@ -876,14 +850,11 @@ out:
static int udf_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct inode *inode;
+ struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
struct pathComponent *pc;
const char *compstart;
- struct udf_fileident_bh fibh;
struct extent_position epos = {};
int eoffset, elen = 0;
- struct fileIdentDesc *fi;
- struct fileIdentDesc cfi;
uint8_t *ea;
int err;
int block;
@@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
struct udf_inode_info *iinfo;
struct super_block *sb = dir->i_sb;
- inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
- if (!inode)
- goto out;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
iinfo = UDF_I(inode);
down_write(&iinfo->i_data_sem);
@@ -1012,32 +982,15 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
mark_inode_dirty(inode);
up_write(&iinfo->i_data_sem);
- fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
- if (!fi)
- goto out_fail;
- cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
- cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
- if (UDF_SB(inode->i_sb)->s_lvid_bh) {
- *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
- cpu_to_le32(lvid_get_unique_id(sb));
- }
- udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
- if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- mark_inode_dirty(dir);
- if (fibh.sbh != fibh.ebh)
- brelse(fibh.ebh);
- brelse(fibh.sbh);
- d_instantiate(dentry, inode);
- err = 0;
-
+ err = udf_add_nondir(dentry, inode);
out:
kfree(name);
return err;
out_no_entry:
up_write(&iinfo->i_data_sem);
-out_fail:
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -1222,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
struct udf_fileident_bh fibh;
if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
- goto out_unlock;
+ return ERR_PTR(-EACCES);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
@@ -1230,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
tloc = lelb_to_cpu(cfi.icb.extLocation);
inode = udf_iget(child->d_inode->i_sb, &tloc);
- if (!inode)
- goto out_unlock;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
return d_obtain_alias(inode);
-out_unlock:
- return ERR_PTR(-EACCES);
}
@@ -1252,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
loc.partitionReferenceNum = partref;
inode = udf_iget(sb, &loc);
- if (inode == NULL)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
iput(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 813da94d447b..5401fc33f5cc 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -961,12 +961,14 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
metadata_fe = udf_iget(sb, &addr);
- if (metadata_fe == NULL)
+ if (IS_ERR(metadata_fe)) {
udf_warn(sb, "metadata inode efe not found\n");
- else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+ return metadata_fe;
+ }
+ if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
iput(metadata_fe);
- metadata_fe = NULL;
+ return ERR_PTR(-EIO);
}
return metadata_fe;
@@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
struct udf_part_map *map;
struct udf_meta_data *mdata;
struct kernel_lb_addr addr;
+ struct inode *fe;
map = &sbi->s_partmaps[partition];
mdata = &map->s_type_specific.s_metadata;
@@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Metadata file location: block = %d part = %d\n",
mdata->s_meta_file_loc, map->s_partition_num);
- mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
- mdata->s_meta_file_loc, map->s_partition_num);
-
- if (mdata->s_metadata_fe == NULL) {
+ fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
+ map->s_partition_num);
+ if (IS_ERR(fe)) {
/* mirror file entry */
udf_debug("Mirror metadata file location: block = %d part = %d\n",
mdata->s_mirror_file_loc, map->s_partition_num);
- mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
- mdata->s_mirror_file_loc, map->s_partition_num);
+ fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
+ map->s_partition_num);
- if (mdata->s_mirror_fe == NULL) {
+ if (IS_ERR(fe)) {
udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
- return -EIO;
+ return PTR_ERR(fe);
}
- }
+ mdata->s_mirror_fe = fe;
+ } else
+ mdata->s_metadata_fe = fe;
+
/*
* bitmap file entry
@@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Bitmap file location: block = %d part = %d\n",
addr.logicalBlockNum, addr.partitionReferenceNum);
- mdata->s_bitmap_fe = udf_iget(sb, &addr);
- if (mdata->s_bitmap_fe == NULL) {
+ fe = udf_iget(sb, &addr);
+ if (IS_ERR(fe)) {
if (sb->s_flags & MS_RDONLY)
udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
else {
udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
- return -EIO;
+ return PTR_ERR(fe);
}
- }
+ } else
+ mdata->s_bitmap_fe = fe;
}
udf_debug("udf_load_metadata_files Ok\n");
@@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
phd->unallocSpaceTable.extPosition),
.partitionReferenceNum = p_index,
};
+ struct inode *inode;
- map->s_uspace.s_table = udf_iget(sb, &loc);
- if (!map->s_uspace.s_table) {
+ inode = udf_iget(sb, &loc);
+ if (IS_ERR(inode)) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
p_index);
- return -EIO;
+ return PTR_ERR(inode);
}
+ map->s_uspace.s_table = inode;
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
udf_debug("unallocSpaceTable (part %d) @ %ld\n",
p_index, map->s_uspace.s_table->i_ino);
@@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
phd->freedSpaceTable.extPosition),
.partitionReferenceNum = p_index,
};
+ struct inode *inode;
- map->s_fspace.s_table = udf_iget(sb, &loc);
- if (!map->s_fspace.s_table) {
+ inode = udf_iget(sb, &loc);
+ if (IS_ERR(inode)) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
p_index);
- return -EIO;
+ return PTR_ERR(inode);
}
-
+ map->s_fspace.s_table = inode;
map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
udf_debug("freedSpaceTable (part %d) @ %ld\n",
p_index, map->s_fspace.s_table->i_ino);
@@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
struct udf_part_map *map = &sbi->s_partmaps[p_index];
sector_t vat_block;
struct kernel_lb_addr ino;
+ struct inode *inode;
/*
* VAT file entry is in the last recorded block. Some broken disks have
@@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
ino.partitionReferenceNum = type1_index;
for (vat_block = start_block;
vat_block >= map->s_partition_root &&
- vat_block >= start_block - 3 &&
- !sbi->s_vat_inode; vat_block--) {
+ vat_block >= start_block - 3; vat_block--) {
ino.logicalBlockNum = vat_block - map->s_partition_root;
- sbi->s_vat_inode = udf_iget(sb, &ino);
+ inode = udf_iget(sb, &ino);
+ if (!IS_ERR(inode)) {
+ sbi->s_vat_inode = inode;
+ break;
+ }
}
}
@@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* assign inodes by physical block number */
/* perhaps it's not extensible enough, but for now ... */
inode = udf_iget(sb, &rootdir);
- if (!inode) {
+ if (IS_ERR(inode)) {
udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
- ret = -EIO;
+ ret = PTR_ERR(inode);
goto error_out;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index be7dabbbcb49..742557be9936 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -143,7 +143,6 @@ extern int udf_expand_file_adinicb(struct inode *);
extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
extern int udf_setsize(struct inode *, loff_t);
-extern void udf_read_inode(struct inode *);
extern void udf_evict_inode(struct inode *);
extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
@@ -209,7 +208,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
-extern struct inode *udf_new_inode(struct inode *, umode_t, int *);
+extern struct inode *udf_new_inode(struct inode *, umode_t);
/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a9cc75ffa925..7caa01652888 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -298,7 +298,10 @@ cg_found:
ufsi->i_oeftflag = 0;
ufsi->i_dir_start_lookup = 0;
memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ err = -EIO;
+ goto failed;
+ }
mark_inode_dirty(inode);
if (uspi->fs_magic == UFS2_MAGIC) {
@@ -337,6 +340,7 @@ cg_found:
fail_remove_inode:
unlock_ufs(sb);
clear_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7c580c97990e..be7d42c7d938 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete) {
- lock_ufs(inode->i_sb);
- ufs_free_inode (inode);
- unlock_ufs(inode->i_sb);
- }
+ if (want_delete)
+ ufs_free_inode(inode);
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8f8eba..fd65deb4b5f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = ufs_add_link(dentry, inode);
if (!err) {
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
return 0;
}
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -126,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out_notlocked;
- lock_ufs(dir->i_sb);
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out;
+ goto out_notlocked;
+ lock_ufs(dir->i_sb);
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
inode->i_op = &ufs_symlink_inode_operations;
@@ -155,6 +157,7 @@ out_notlocked:
out_fail:
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -181,13 +184,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
struct inode * inode;
int err;
- lock_ufs(dir->i_sb);
- inode_inc_link_count(dir);
-
inode = ufs_new_inode(dir, S_IFDIR|mode);
- err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out_dir;
+ return PTR_ERR(inode);
inode->i_op = &ufs_dir_inode_operations;
inode->i_fop = &ufs_dir_operations;
@@ -195,6 +194,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
inode_inc_link_count(inode);
+ lock_ufs(dir->i_sb);
+ inode_inc_link_count(dir);
+
err = ufs_make_empty(inode, dir);
if (err)
goto out_fail;
@@ -211,8 +213,8 @@ out:
out_fail:
inode_dec_link_count(inode);
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput (inode);
-out_dir:
inode_dec_link_count(dir);
unlock_ufs(dir->i_sb);
goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index de2d26d32844..86df952d3e24 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5424,7 +5424,7 @@ xfs_bmap_shift_extents(
struct xfs_bmap_free *flist,
int num_exts)
{
- struct xfs_btree_cur *cur;
+ struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_rec_host *gotp;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec left;
@@ -5435,7 +5435,7 @@ xfs_bmap_shift_extents(
int error = 0;
int i;
int whichfork = XFS_DATA_FORK;
- int logflags;
+ int logflags = 0;
xfs_filblks_t blockcount = 0;
int total_extents;
@@ -5478,16 +5478,11 @@ xfs_bmap_shift_extents(
}
}
- /* We are going to change core inode */
- logflags = XFS_ILOG_CORE;
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
cur->bc_private.b.flist = flist;
cur->bc_private.b.flags = 0;
- } else {
- cur = NULL;
- logflags |= XFS_ILOG_DEXT;
}
/*
@@ -5545,11 +5540,14 @@ xfs_bmap_shift_extents(
blockcount = left.br_blockcount +
got.br_blockcount;
xfs_iext_remove(ip, *current_ext, 1, 0);
+ logflags |= XFS_ILOG_CORE;
if (cur) {
error = xfs_btree_delete(cur, &i);
if (error)
goto del_cursor;
XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ } else {
+ logflags |= XFS_ILOG_DEXT;
}
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -5575,6 +5573,7 @@ xfs_bmap_shift_extents(
got.br_startoff = startoff;
}
+ logflags |= XFS_ILOG_CORE;
if (cur) {
error = xfs_bmbt_update(cur, got.br_startoff,
got.br_startblock,
@@ -5582,6 +5581,8 @@ xfs_bmap_shift_extents(
got.br_state);
if (error)
goto del_cursor;
+ } else {
+ logflags |= XFS_ILOG_DEXT;
}
(*current_ext)++;
@@ -5597,6 +5598,7 @@ del_cursor:
xfs_btree_del_cursor(cur,
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
- xfs_trans_log_inode(tp, ip, logflags);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
return error;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11e9b4caa54f..b984647c24db 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1753,11 +1753,72 @@ xfs_vm_readpages(
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ * Returns non-zero only if this call turned a clean page dirty.
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	loff_t end_offset;
+	loff_t offset;
+	int newly_dirty;
+
+	/* Check first: mapping->host must not be read when mapping is NULL. */
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(mapping->host);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << mapping->host->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) { /* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
+ .set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d39a35..1707980f9a4b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1470,6 +1470,26 @@ xfs_collapse_file_space(
start_fsb = XFS_B_TO_FSB(mp, offset + len);
shift_fsb = XFS_B_TO_FSB(mp, len);
+ /*
+ * Writeback the entire file and force remove any post-eof blocks. The
+ * writeback prevents changes to the extent list via concurrent
+ * writeback and the eofblocks trim prevents the extent shift algorithm
+ * from running into a post-eof delalloc extent.
+ *
+ * XXX: This is a temporary fix until the extent shift loop below is
+ * converted to use offsets and lookups within the ILOCK rather than
+ * carrying around the index into the extent list for the next
+ * iteration.
+ */
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ if (xfs_can_free_eofblocks(ip, true)) {
+ error = xfs_free_eofblocks(mp, ip, false);
+ if (error)
+ return error;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 076b1708d134..de5368c803f9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -291,12 +291,22 @@ xfs_file_read_iter(
if (inode->i_mapping->nrpages) {
ret = filemap_write_and_wait_range(
VFS_I(ip)->i_mapping,
- pos, -1);
+ pos, pos + size - 1);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
@@ -632,10 +642,19 @@ xfs_file_dio_aio_write(
if (mapping->nrpages) {
ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, -1);
+ pos, pos + count - 1);
if (ret)
goto out;
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + count - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
/*