summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- fs/ceph/caps.c 50
-rw-r--r-- fs/ceph/inode.c 1
-rw-r--r-- fs/ceph/mds_client.c 93
-rw-r--r-- fs/ceph/mds_client.h 2
-rw-r--r-- fs/ceph/super.h 2
5 files changed, 91 insertions, 57 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9a25f8d66fbc..0295048724d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1415,6 +1415,29 @@ static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
}
+static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, /* insert cf into mdsc->cap_flush_tree, keyed by cf->tid */
+ struct ceph_cap_flush *cf)
+{ /* NOTE(review): the visible caller drops mdsc->cap_dirty_lock after calling this; presumably it must be held here - confirm */
+ struct rb_node **p = &mdsc->cap_flush_tree.rb_node; /* candidate link slot, starting at the root */
+ struct rb_node *parent = NULL;
+ struct ceph_cap_flush *other = NULL;
+
+ while (*p) { /* descend until an empty child slot is reached */
+ parent = *p;
+ other = rb_entry(parent, struct ceph_cap_flush, g_node); /* flush record that embeds this tree node */
+
+ if (cf->tid < other->tid)
+ p = &(*p)->rb_left; /* smaller tids go left */
+ else if (cf->tid > other->tid)
+ p = &(*p)->rb_right; /* larger tids go right */
+ else
+ BUG(); /* a record with the same tid is already in the tree */
+ }
+
+ rb_link_node(&cf->g_node, parent, p); /* attach at the empty slot found above */
+ rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); /* rbtree API: rebalance after linking */
+}
+
/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
@@ -1449,17 +1472,16 @@ static int __mark_caps_flushing(struct inode *inode,
list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid;
+ __add_cap_flushing_to_mdsc(mdsc, cf);
if (list_empty(&ci->i_flushing_item)) {
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing seq %lld\n", inode,
- ci->i_cap_flush_seq);
+ dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
} else {
list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) seq %lld\n", inode,
- ci->i_cap_flush_seq);
+ dout(" inode %p now flushing (more) tid %llu\n",
+ inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
@@ -2123,8 +2145,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
- ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+ dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ ceph_cap_string(ci->i_flushing_caps));
__ceph_flush_snaps(ci, &session, 1);
@@ -2921,12 +2943,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(ci->i_flushing_caps & ~cleaned));
- if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ if (list_empty(&to_remove) && !cleaned)
goto out;
ci->i_flushing_caps &= ~cleaned;
spin_lock(&mdsc->cap_dirty_lock);
+
+ if (!list_empty(&to_remove)) {
+ list_for_each_entry(cf, &to_remove, list)
+ rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
+ n = rb_first(&mdsc->cap_flush_tree);
+ cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+ if (!cf || cf->tid > flush_tid)
+ wake_up_all(&mdsc->cap_flushing_wq);
+ }
+
if (ci->i_flushing_caps == 0) {
list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing))
@@ -2936,7 +2969,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info,
i_flushing_item)->vfs_inode);
mdsc->num_cap_flushing--;
- wake_up_all(&mdsc->cap_flushing_wq);
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6d3f19db8c8a..3326302f5884 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,7 +416,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
- ci->i_cap_flush_seq = 0;
ci->i_cap_flush_tree = RB_ROOT;
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 839901f51512..31f6a78caa0a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1164,6 +1164,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
spin_lock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(cf, &to_remove, list)
+ rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
" dropping dirty %s state for %p %lld\n",
@@ -1467,39 +1471,56 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static int check_cap_flush(struct ceph_inode_info *ci,
- u64 want_flush_seq, u64 want_snap_seq)
+static int check_capsnap_flush(struct ceph_inode_info *ci, /* returns 0 only when want_snap_seq > 0 and the first entry of ci->i_cap_snaps has follows < want_snap_seq */
+ u64 want_snap_seq)
{
- int ret1 = 1, ret2 = 1;
+ int ret = 1; /* result when want_snap_seq is 0 or ci has no cap snaps */
spin_lock(&ci->i_ceph_lock);
- if (want_flush_seq > 0 && ci->i_flushing_caps)
- ret1 = ci->i_cap_flush_seq >= want_flush_seq;
-
if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
struct ceph_cap_snap *capsnap =
list_first_entry(&ci->i_cap_snaps,
struct ceph_cap_snap, ci_item);
- ret2 = capsnap->follows >= want_snap_seq;
+ ret = capsnap->follows >= want_snap_seq; /* only the first cap snap on the list is examined */
}
spin_unlock(&ci->i_ceph_lock);
- return ret1 && ret2;
+ return ret;
+}
+
+static int check_caps_flush(struct ceph_mds_client *mdsc, /* returns 0 while the first node of mdsc->cap_flush_tree has tid <= want_flush_tid, otherwise 1 */
+ u64 want_flush_tid)
+{
+ struct rb_node *n;
+ struct ceph_cap_flush *cf;
+ int ret = 1; /* result when the tree is empty or its first tid is above want_flush_tid */
+
+ spin_lock(&mdsc->cap_dirty_lock); /* the tree is inspected under cap_dirty_lock */
+ n = rb_first(&mdsc->cap_flush_tree); /* first node in sort order; __add_cap_flushing_to_mdsc orders by tid, so this is the smallest tid */
+ cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; /* NULL when the tree is empty */
+ if (cf && cf->tid <= want_flush_tid) { /* an entry at or below the target tid is still in the tree */
+ dout("check_caps_flush still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
+ ret = 0;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ return ret;
}
/*
* flush all dirty inode data to disk.
*
- * returns true if we've flushed through want_flush_seq
+ * waits until we've flushed through want_flush_tid (no return value)
*/
static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_seq, u64 want_snap_seq)
+ u64 want_flush_tid, u64 want_snap_seq)
{
int mds;
- dout("check_cap_flush want %lld\n", want_flush_seq);
+ dout("check_caps_flush want %llu snap want %llu\n",
+ want_flush_tid, want_snap_seq);
mutex_lock(&mdsc->mutex);
for (mds = 0; mds < mdsc->max_sessions; ) {
struct ceph_mds_session *session = mdsc->sessions[mds];
- struct inode *inode1 = NULL, *inode2 = NULL;
+ struct inode *inode = NULL;
if (!session) {
mds++;
@@ -1509,58 +1530,40 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_first_entry(&session->s_cap_flushing,
- struct ceph_inode_info,
- i_flushing_item);
-
- if (!check_cap_flush(ci, want_flush_seq, 0)) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n",
- &ci->vfs_inode, ci->i_cap_flush_seq,
- want_flush_seq, mds);
- inode1 = igrab(&ci->vfs_inode);
- }
- }
if (!list_empty(&session->s_cap_snaps_flushing)) {
struct ceph_cap_snap *capsnap =
list_first_entry(&session->s_cap_snaps_flushing,
struct ceph_cap_snap,
flushing_item);
struct ceph_inode_info *ci = capsnap->ci;
- if (!check_cap_flush(ci, 0, want_snap_seq)) {
+ if (!check_capsnap_flush(ci, want_snap_seq)) {
dout("check_cap_flush still flushing snap %p "
"follows %lld <= %lld to mds%d\n",
&ci->vfs_inode, capsnap->follows,
want_snap_seq, mds);
- inode2 = igrab(&ci->vfs_inode);
+ inode = igrab(&ci->vfs_inode);
}
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
- if (inode1) {
- wait_event(mdsc->cap_flushing_wq,
- check_cap_flush(ceph_inode(inode1),
- want_flush_seq, 0));
- iput(inode1);
- }
- if (inode2) {
+ if (inode) {
wait_event(mdsc->cap_flushing_wq,
- check_cap_flush(ceph_inode(inode2),
- 0, want_snap_seq));
- iput(inode2);
- }
-
- if (!inode1 && !inode2)
+ check_capsnap_flush(ceph_inode(inode),
+ want_snap_seq));
+ iput(inode);
+ } else {
mds++;
+ }
mutex_lock(&mdsc->mutex);
}
-
mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+
+ wait_event(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid));
+
+ dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
}
/*
@@ -3426,8 +3429,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
- mdsc->cap_flush_seq = 0;
mdsc->last_cap_flush_tid = 1;
+ mdsc->cap_flush_tree = RB_ROOT;
INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
@@ -3554,7 +3557,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
- want_flush = mdsc->cap_flush_seq;
+ want_flush = mdsc->last_cap_flush_tid;
spin_unlock(&mdsc->cap_dirty_lock);
down_read(&mdsc->snap_rwsem);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 19f6084203f0..470be4eb25f3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -306,8 +306,8 @@ struct ceph_mds_client {
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
- u64 cap_flush_seq;
u64 last_cap_flush_tid;
+ struct rb_root cap_flush_tree;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cc597f52e046..94d91471165f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -189,6 +189,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
struct ceph_cap_flush {
u64 tid;
int caps;
+ struct rb_node g_node;
union {
struct rb_node i_node;
struct list_head list;
@@ -304,7 +305,6 @@ struct ceph_inode_info {
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
struct list_head i_dirty_item, i_flushing_item;
- u64 i_cap_flush_seq;
/* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */