Diffstat (limited to 'fs/afs/validation.c')
-rw-r--r--fs/afs/validation.c305
1 files changed, 200 insertions, 105 deletions
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 6aadd5e075e4..46b37f2cce7d 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -11,6 +11,131 @@
#include "internal.h"
/*
+ * Data validation is managed through a number of mechanisms from the server:
+ *
+ * (1) On first contact with a server (such as if it has just been rebooted),
+ * the server sends us a CB.InitCallBackState* request.
+ *
+ * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
+ * calls, the server maintains a time-limited per-vnode promise that it
+ * will send us a CB.CallBack request if a third party alters the vnodes
+ * accessed.
+ *
+ * Note that a vnode-level callbacks may also be sent for other reasons,
+ * such as filelock release.
+ *
+ * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
+ * calls, each server maintains a time-limited per-volume promise that it
+ * will send us a CB.CallBack request if the RO volume is updated to a
+ * snapshot of the RW volume ("vos release"). This is an atomic event
+ * that cuts over all instances of the RO volume across multiple servers
+ * simultaneously.
+ *
+ * Note that a volume-level callbacks may also be sent for other reasons,
+ * such as the volumeserver taking over control of the volume from the
+ * fileserver.
+ *
+ * Note also that each server maintains an independent time limit on an
+ * independent callback.
+ *
+ * (4) Certain RPC calls include a volume information record "VolSync" in
+ * their reply. This contains a creation date for the volume that should
+ * remain unchanged for a RW volume (but will be changed if the volume is
+ * restored from backup) or will be bumped to the time of snapshotting
+ * when a RO volume is released.
+ *
+ * In order to track this events, the following are provided:
+ *
+ * ->cb_v_break. A counter of events that might mean that the contents of
+ * a volume have been altered since we last checked a vnode.
+ *
+ * ->cb_v_check. A counter of the number of events that we've sent a
+ * query to the server for. Everything's up to date if this equals
+ * cb_v_break.
+ *
+ * ->cb_scrub. A counter of the number of regression events for which we
+ * have to completely wipe the cache.
+ *
+ * ->cb_ro_snapshot. A counter of the number of times that we've
+ * recognised that a RO volume has been updated.
+ *
+ * ->cb_break. A counter of events that might mean that the contents of a
+ * vnode have been altered.
+ *
+ * ->cb_expires_at. The time at which the callback promise expires or
+ * AFS_NO_CB_PROMISE if we have no promise.
+ *
+ * The way we manage things is:
+ *
+ * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
+ * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
+ * volume and volume's server record.
+ *
+ * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
+ * callback break on all the volumes that have been using that volume
+ * (ie. increment ->cb_v_break and reset ->cb_expires_at).
+ *
+ * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
+ * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
+ * dispatch a work item to unmap all PTEs to the vnode's pagecache to
+ * force reentry to the filesystem for revalidation.
+ *
+ * (4) When entering the filesystem, we call afs_validate() to check the
+ * validity of a vnode. This first checks to see if ->cb_v_check and
+ * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
+ * exclusively and perform an FS.FetchStatus on the vnode.
+ *
+ * After checking the volume, we check the vnode. If there's a mismatch
+ * between the volume counters and the vnode's mirrors of those counters,
+ * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
+ *
+ * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
+ * parsed:
+ *
+ * (A) If the Creation timestamp has changed on a RW volume or regressed
+ * on a RO volume, we try to increment ->cb_scrub; if it advances on a
+ * RO volume, we assume "vos release" happened and try to increment
+ * ->cb_ro_snapshot.
+ *
+ * (B) If the Update timestamp has regressed, we try to increment
+ * ->cb_scrub.
+ *
+ * Note that in both of these cases, we only do the increment if we can
+ * cmpxchg the value of the timestamp from the value we noted before the
+ * op. This tries to prevent parallel ops from fighting one another.
+ *
+ * volume->cb_v_check is then set to ->cb_v_break.
+ *
+ * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
+ * parsed and used to set the promise in ->cb_expires_at for the vnode,
+ * the volume and the volume's server record.
+ *
+ * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
+ * the vnode.
+ */
+
+/*
+ * Check the validity of a vnode/inode and its parent volume.
+ */
+bool afs_check_validity(const struct afs_vnode *vnode)
+{
+ const struct afs_volume *volume = vnode->volume;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+
+ if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline ||
+ volume->cb_expires_at <= deadline ||
+ vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
+ vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
+ test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ _debug("inval");
+ return false;
+ }
+
+ return true;
+}
+
+/*
* See if the server we've just talked to is currently excluded.
*/
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
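Review bot: In afs_check_validity(), `volume->cb_expires_at <= deadline` is a lockless plain read, while the vnode's expiry on the line before it goes through `atomic64_read()`. The same field is stored with a plain assignment in afs_update_volume_state() (`volume->cb_expires_at = expires_at;`) and no lock is visible around either side. Assuming the field is a time64_t, as that assignment suggests, a 32-bit build could presumably read it torn and judge a lapsed promise still valid. Making the field an `atomic64_t` like the vnode's would close this; READ_ONCE()/WRITE_ONCE() alone would mark the race but not prevent tearing of a 64-bit value on 32-bit. The unlocked read of the same field at the top of afs_validate() needs the same treatment.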
@@ -185,11 +310,17 @@ out:
}
/*
- * Update the state of a volume. Returns 1 to redo the operation from the start.
+ * Update the state of a volume, including recording the expiration time of the
+ * callback promise. Returns 1 to redo the operation from the start.
*/
int afs_update_volume_state(struct afs_operation *op)
{
+ struct afs_server_list *slist = op->server_list;
+ struct afs_server_entry *se = &slist->servers[op->server_index];
+ struct afs_callback *cb = &op->file[0].scb.callback;
struct afs_volume *volume = op->volume;
+ unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
+ unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
int ret;
_enter("%llx", op->volume->vid);
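Review bot: `se = &slist->servers[op->server_index]` is formed at function entry with no visible check that `op->server_list` is set or that `op->server_index` is below `slist->nr_servers`, and `se` is later written through in `se->cb_expires_at = expires_at;`. The callers are not shown, so this may well be guaranteed by the time a reply is parsed, but nothing in the hunk says so. I would form the pointer only where it is used, inside the `have_cb` block, as `struct afs_server_entry *se = &op->server_list->servers[op->server_index];`, and state the precondition in the function comment. The body of afs_update_volume_state() continues outside this hunk.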
@@ -202,6 +333,18 @@ int afs_update_volume_state(struct afs_operation *op)
}
}
+ if (op->cb_v_break == cb_v_break &&
+ (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
+ time64_t expires_at = cb->expires_at;
+
+ if (!op->file[0].scb.have_cb)
+ expires_at = op->file[1].scb.callback.expires_at;
+
+ se->cb_expires_at = expires_at;
+ volume->cb_expires_at = expires_at;
+ }
+ if (cb_v_check < op->cb_v_break)
+ atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
return 0;
}
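Review bot: When both `op->file[0].scb.have_cb` and `op->file[1].scb.have_cb` are set, only file[0]'s expiry reaches `se->cb_expires_at` and `volume->cb_expires_at`; file[1]'s value is used purely as a fallback. If the two records can carry different expiry times, the earlier one is the conservative choice for a cache-validity deadline, e.g. `if (op->file[0].scb.have_cb && op->file[1].scb.have_cb) expires_at = min(expires_at, op->file[1].scb.callback.expires_at);` after the fallback. If they are always equal within one reply, a short comment saying so would be enough. The function starts outside this hunk.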
@@ -225,99 +368,6 @@ static void afs_zap_data(struct afs_vnode *vnode)
}
/*
- * Check to see if we have a server currently serving this volume and that it
- * hasn't been reinitialised or dropped from the list.
- */
-static bool afs_check_server_good(struct afs_vnode *vnode)
-{
- struct afs_server_list *slist;
- struct afs_server *server;
- bool good;
- int i;
-
- if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
- return true;
-
- rcu_read_lock();
-
- slist = rcu_dereference(vnode->volume->servers);
- for (i = 0; i < slist->nr_servers; i++) {
- server = slist->servers[i].server;
- if (server == vnode->cb_server) {
- good = (vnode->cb_s_break == server->cb_s_break);
- rcu_read_unlock();
- return good;
- }
- }
-
- rcu_read_unlock();
- return false;
-}
-
-/*
- * Check the validity of a vnode/inode.
- */
-bool afs_check_validity(struct afs_vnode *vnode)
-{
- enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
- time64_t now = ktime_get_real_seconds();
- unsigned int cb_break;
- int seq;
-
- do {
- seq = read_seqbegin(&vnode->cb_lock);
- cb_break = vnode->cb_break;
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- if (vnode->cb_v_break != atomic_read(&vnode->volume->cb_v_break))
- need_clear = afs_cb_break_for_v_break;
- else if (!afs_check_server_good(vnode))
- need_clear = afs_cb_break_for_s_reinit;
- else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- need_clear = afs_cb_break_for_zap;
- else if (vnode->cb_expires_at - 10 <= now)
- need_clear = afs_cb_break_for_lapsed;
- } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- ;
- } else {
- need_clear = afs_cb_break_no_promise;
- }
-
- } while (read_seqretry(&vnode->cb_lock, seq));
-
- if (need_clear == afs_cb_break_no_break)
- return true;
-
- write_seqlock(&vnode->cb_lock);
- if (need_clear == afs_cb_break_no_promise)
- vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
- else if (cb_break == vnode->cb_break)
- __afs_break_callback(vnode, need_clear);
- else
- trace_afs_cb_miss(&vnode->fid, need_clear);
- write_sequnlock(&vnode->cb_lock);
- return false;
-}
-
-/*
- * Returns true if the pagecache is still valid. Does not sleep.
- */
-bool afs_pagecache_valid(struct afs_vnode *vnode)
-{
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- return true;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
- return true;
-
- return false;
-}
-
-/*
* validate a vnode/inode
* - there are several things we need to check
* - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -328,23 +378,48 @@ bool afs_pagecache_valid(struct afs_vnode *vnode)
*/
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
+ struct afs_volume *volume = vnode->volume;
+ unsigned int cb_ro_snapshot, cb_scrub;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+ bool zap = false, locked_vol = false;
int ret;
_enter("{v={%llx:%llu} fl=%lx},%x",
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
- if (afs_pagecache_valid(vnode))
- goto valid;
+ if (afs_check_validity(vnode))
+ return 0;
- down_write(&vnode->validate_lock);
+ ret = down_write_killable(&vnode->validate_lock);
+ if (ret < 0)
+ goto error;
+
+ /* Validate a volume after the v_break has changed or the volume
+ * callback expired. We only want to do this once per volume per
+ * v_break change. The actual work will be done when parsing the
+ * status fetch reply.
+ */
+ if (volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
+ ret = mutex_lock_interruptible(&volume->cb_check_lock);
+ if (ret < 0)
+ goto error_unlock;
+ locked_vol = true;
+ }
- /* if the promise has expired, we need to check the server again to get
- * a new promise - note that if the (parent) directory's metadata was
- * changed then the security may be different and we may no longer have
- * access */
- if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- _debug("not promised");
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub)
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
+
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub ||
+ volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline
+ ) {
ret = afs_fetch_status(vnode, key, false, NULL);
if (ret < 0) {
if (ret == -ENOENT) {
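Review bot: The fast path no longer short-circuits deleted vnodes. The removed afs_pagecache_valid() returned true when AFS_VNODE_DELETED was set (after clearing i_nlink), whereas the new `if (afs_check_validity(vnode))` tests no such flag. A deleted vnode that fails the counter or expiry checks therefore falls through to the two locks, a possible FS.FetchStatus, and the later `-ESTALE` return, where it previously got 0. If that change is intended, the function comment should say afs_validate() now returns -ESTALE for deleted vnodes. If it is not, a guard ahead of the lock such as `if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) return 0;` would keep the old result without the RPC. afs_validate() continues past the end of this hunk.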
@@ -353,9 +428,26 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
}
goto error_unlock;
}
+
_debug("new promise [fl=%lx]", vnode->flags);
}
+ /* We can drop the volume lock now as. */
+ if (locked_vol) {
+ mutex_unlock(&volume->cb_check_lock);
+ locked_vol = false;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ _debug("vnode inval %x==%x %x==%x",
+ vnode->cb_ro_snapshot, cb_ro_snapshot,
+ vnode->cb_scrub, cb_scrub);
+ if (vnode->cb_scrub != cb_scrub)
+ zap = true;
+ vnode->cb_ro_snapshot = cb_ro_snapshot;
+ vnode->cb_scrub = cb_scrub;
+
if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
_debug("file already deleted");
ret = -ESTALE;
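Review bot: The comment `/* We can drop the volume lock now as. */` stops mid-sentence, so the reason `volume->cb_check_lock` may be released before the vnode's counters are re-read is not recorded. Going by the earlier comment in this function (the work is done when the status fetch reply is parsed), something like `/* The status fetch has updated the volume state, so the volume lock can be dropped before the vnode's mirrors are refreshed. */` looks like what was meant; the author should confirm. The enclosing function starts and ends outside this hunk.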
@@ -364,15 +456,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
/* if the vnode's data version number changed then its contents are
* different */
- if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+ zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+ if (zap)
afs_zap_data(vnode);
up_write(&vnode->validate_lock);
-valid:
_leave(" = 0");
return 0;
error_unlock:
+ if (locked_vol)
+ mutex_unlock(&volume->cb_check_lock);
up_write(&vnode->validate_lock);
+error:
_leave(" = %d", ret);
return ret;
}
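Review bot: The comment above `zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);` still names a data-version change as the only reason to zap, but `zap` is also set earlier in afs_validate() when `vnode->cb_scrub != cb_scrub`. I would extend it, e.g. `/* Zap the pagecache if the data version changed or the volume's cb_scrub counter advanced. */`. The start of the function is outside this hunk.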