// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 *  (1) On first contact with a server (such as if it has just been rebooted),
 *      the server sends us a CB.InitCallBackState* request.
 *
 *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *      calls, the server maintains a time-limited per-vnode promise that it
 *      will send us a CB.CallBack request if a third party alters the vnodes
 *      accessed.
 *
 *      Note that vnode-level callbacks may also be sent for other reasons,
 *      such as a file lock being released.
 *
 *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *      calls, each server maintains a time-limited per-volume promise that it
 *      will send us a CB.CallBack request if the RO volume is updated to a
 *      snapshot of the RW volume ("vos release").  This is an atomic event
 *      that cuts over all instances of the RO volume across multiple servers
 *      simultaneously.
 *
 *	Note that volume-level callbacks may also be sent for other reasons,
 *	such as the volumeserver taking over control of the volume from the
 *	fileserver.
 *
 *	Note also that each server maintains an independent time limit on an
 *	independent callback.
 *
 *  (4) Certain RPC calls include a volume information record "VolSync" in
 *      their reply.  This contains a creation date for the volume that should
 *      remain unchanged for a RW volume (but will be changed if the volume is
 *      restored from backup) or will be bumped to the time of snapshotting
 *      when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents of
 *	a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *      recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of a
 *      vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires or
 *      AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *      volume and volume's server record.
 *
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *	callback break on all the volumes that have been using that server
 *	(ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *	vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *	dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *	force reentry to the filesystem for revalidation.
 *
 *  (4) When entering the filesystem, we call afs_validate() to check the
 *	validity of a vnode.  This first checks to see if ->cb_v_check and
 *	->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *	exclusively and perform an FS.FetchStatus on the vnode.
 *
 *	After checking the volume, we check the vnode.  If there's a mismatch
 *	between the volume counters and the vnode's mirrors of those counters,
 *	we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *      parsed:
 *
 *	(A) If the Creation timestamp has changed on a RW volume or regressed
 *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	    RO volume, we assume "vos release" happened and try to increment
 *	    ->cb_ro_snapshot.
 *
 *      (B) If the Update timestamp has regressed, we try to increment
 *	    ->cb_scrub.
 *
 *      Note that in both of these cases, we only do the increment if we can
 *      cmpxchg the value of the timestamp from the value we noted before the
 *      op.  This tries to prevent parallel ops from fighting one another.
 *
 *	volume->cb_v_check is then set to ->cb_v_break.
 *
 *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *	parsed and used to set the promise in ->cb_expires_at for the vnode,
 *	the volume and the volume's server record.
 *
 *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *      the vnode.
 */
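
/*
 * Illustrative sketch (not part of the build): the scheme above boils down
 * to break/check generation counters that are compared on entry.  A
 * volume-level break and its later detection look roughly like this:
 *
 *	// Volume-level CB.CallBack arrives: step (1).
 *	atomic_inc(&volume->cb_v_break);
 *	volume->cb_expires_at = AFS_NO_CB_PROMISE;
 *
 *	// Next entry to the filesystem: step (4).
 *	if (atomic_read(&volume->cb_v_check) !=
 *	    atomic_read(&volume->cb_v_break))
 *		issue_fetch_status();	// hypothetical helper
 *
 * cb_v_check is only folded up to the value of cb_v_break noted before the
 * FS.FetchStatus was issued (step (5)), so a break that races with the
 * fetch leaves the counters unequal and forces another check.
 */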

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline ||
	    volume->cb_expires_at <= deadline ||
	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
	    vnode->cb_scrub	  != atomic_read(&volume->cb_scrub) ||
	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
		_debug("inval");
		return false;
	}

	return true;
}
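
/*
 * A note on the deadline above (a sketch of the intent, not extra code):
 * treating any promise that expires within the next 10 seconds as already
 * expired means we never act on the strength of a promise that could lapse
 * mid-RPC.  For example, with a promise expiring 3 seconds from now:
 *
 *	time64_t deadline = ktime_get_real_seconds() + 10;	// now + 10s
 *	atomic64_read(&vnode->cb_expires_at) <= deadline	// true
 *
 * so validation fails now and the status is refetched instead.
 */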

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}
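
/*
 * Illustrative note on the RCU usage above: the server list is only ever
 * replaced wholesale, so a reader can walk a consistent snapshot of it
 * under rcu_read_lock() without taking any volume lock:
 *
 *	rcu_read_lock();
 *	slist = rcu_dereference(volume->servers);	// pinned snapshot
 *	...walk slist->servers[0 .. nr_servers - 1]...
 *	rcu_read_unlock();
 *
 * The entry flags may still change underfoot; test_bit() gives a
 * point-in-time answer, which is all the caller needs here.
 */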

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume; we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}
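
/*
 * Illustrative sketch of the single-advance rule used above (not extra
 * code): only the operation whose pre-op snapshot still matches the stored
 * timestamp performs the side effects, so racing replies don't fight:
 *
 *	// op A: cur == old, so it advances the state and stores 'new'.
 *	// op B: issued before A finished; by the time it gets here,
 *	//       cur has moved on, so cur != B's old and it bails out.
 *	if (cur != old)
 *		return 0;	// someone else already handled this change
 *
 * This is the mutex-protected analogue of the cmpxchg described in the
 * header comment: compare against the value noted before the op and only
 * act if it is unchanged.
 */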

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of the
 * callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}
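
/*
 * A worked example of the cb_v_check update above (illustrative): suppose
 * cb_v_check was 3 and the op noted cb_v_break == 5 before it was issued.
 * Any break arriving during the op pushes cb_v_break past 5:
 *
 *	atomic_cmpxchg(&volume->cb_v_check, 3, 5);	// catch up to 5
 *	// If cb_v_break is now 6, check (5) != break (6) still holds,
 *	// so the next afs_validate() call queries the server again.
 *
 * Folding in the pre-op value rather than the current one is what stops a
 * callback break that raced with the op from being lost.
 */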

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		invalidate_remote_inode(&vnode->netfs.inode);
	else
		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub	  != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub	  != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline
	    ) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}
	/* We can drop the volume lock now as the volume-level work is done. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}
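
/*
 * Usage sketch (hypothetical caller, not part of this file): a filesystem
 * entry point such as an open or getattr handler would revalidate first
 * and pass a deleted vnode back to the VFS as -ESTALE:
 *
 *	ret = afs_validate(vnode, key);
 *	if (ret < 0)
 *		return ret;	// e.g. -ESTALE, or an FS.FetchStatus error
 *	...proceed, trusting cached state until the promise lapses...
 */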