// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_volume *volume = op->volume;
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	return 0;
}

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		invalidate_remote_inode(&vnode->netfs.inode);
	else
		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}

/*
 * Check to see if we have a server currently serving this volume and that it
 * hasn't been reinitialised or dropped from the list.
 */
static bool afs_check_server_good(struct afs_vnode *vnode)
{
	struct afs_server_list *slist;
	struct afs_server *server;
	bool good;
	int i;

	if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
		return true;

	rcu_read_lock();

	slist = rcu_dereference(vnode->volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		server = slist->servers[i].server;
		if (server == vnode->cb_server) {
			good = (vnode->cb_s_break == server->cb_s_break);
			rcu_read_unlock();
			return good;
		}
	}

	rcu_read_unlock();
	return false;
}

/*
 * Check the validity of a vnode/inode.
 */
bool afs_check_validity(struct afs_vnode *vnode)
{
	enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
	time64_t now = ktime_get_real_seconds();
	unsigned int cb_break;
	int seq;

	do {
		seq = read_seqbegin(&vnode->cb_lock);
		cb_break = vnode->cb_break;

		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
			if (vnode->cb_v_break != atomic_read(&vnode->volume->cb_v_break))
				need_clear = afs_cb_break_for_v_break;
			else if (!afs_check_server_good(vnode))
				need_clear = afs_cb_break_for_s_reinit;
			else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
				need_clear = afs_cb_break_for_zap;
			else if (vnode->cb_expires_at - 10 <= now)
				need_clear = afs_cb_break_for_lapsed;
		} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
			;
		} else {
			need_clear = afs_cb_break_no_promise;
		}

	} while (read_seqretry(&vnode->cb_lock, seq));

	if (need_clear == afs_cb_break_no_break)
		return true;

	write_seqlock(&vnode->cb_lock);
	if (need_clear == afs_cb_break_no_promise)
		vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
	else if (cb_break == vnode->cb_break)
		__afs_break_callback(vnode, need_clear);
	else
		trace_afs_cb_miss(&vnode->fid, need_clear);
	write_sequnlock(&vnode->cb_lock);
	return false;
}

/*
 * Returns true if the pagecache is still valid.  Does not sleep.
 */
bool afs_pagecache_valid(struct afs_vnode *vnode)
{
	if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
		if (vnode->netfs.inode.i_nlink)
			clear_nlink(&vnode->netfs.inode);
		return true;
	}

	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
	    afs_check_validity(vnode))
		return true;

	return false;
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_pagecache_valid(vnode))
		goto valid;

	down_write(&vnode->validate_lock);

	/* if the promise has expired, we need to check the server again to get
	 * a new promise - note that if the (parent) directory's metadata was
	 * changed then the security may be different and we may no longer have
	 * access */
	if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
		_debug("not promised");
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}
		_debug("new promise [fl=%lx]", vnode->flags);
	}

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		_debug("file already deleted");
		ret = -ESTALE;
		goto error_unlock;
	}

	/* if the vnode's data version number changed then its contents are
	 * different */
	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
valid:
	_leave(" = 0");
	return 0;

error_unlock:
	up_write(&vnode->validate_lock);
	_leave(" = %d", ret);
	return ret;
}