1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* vnode and volume validity verification.
3  *
4  * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/sched.h>
11 #include "internal.h"
12 
/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 *  (1) On first contact with a server (such as if it has just been rebooted),
 *      the server sends us a CB.InitCallBackState* request.
 *
 *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *      calls, the server maintains a time-limited per-vnode promise that it
 *      will send us a CB.CallBack request if a third party alters the vnodes
 *      accessed.
 *
 *      Note that vnode-level callbacks may also be sent for other reasons,
 *      such as filelock release.
 *
 *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *      calls, each server maintains a time-limited per-volume promise that it
 *      will send us a CB.CallBack request if the RO volume is updated to a
 *      snapshot of the RW volume ("vos release").  This is an atomic event
 *      that cuts over all instances of the RO volume across multiple servers
 *      simultaneously.
 *
 *	Note that volume-level callbacks may also be sent for other reasons,
 *	such as the volumeserver taking over control of the volume from the
 *	fileserver.
 *
 *	Note also that each server maintains an independent time limit on an
 *	independent callback.
 *
 *  (4) Certain RPC calls include a volume information record "VolSync" in
 *      their reply.  This contains a creation date for the volume that should
 *      remain unchanged for a RW volume (but will be changed if the volume is
 *      restored from backup) or will be bumped to the time of snapshotting
 *      when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents of
 *	a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *      recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of a
 *      vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires or
 *      AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *      volume and volume's server record.
 *
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *	callback break on all the volumes that have been using that server
 *	(ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *	vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *	dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *	force reentry to the filesystem for revalidation.
 *
 *  (4) When entering the filesystem, we call afs_validate() to check the
 *	validity of a vnode.  This first checks to see if ->cb_v_check and
 *	->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *	exclusively and perform an FS.FetchStatus on the vnode.
 *
 *	After checking the volume, we check the vnode.  If there's a mismatch
 *	between the volume counters and the vnode's mirrors of those counters,
 *	we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *      parsed:
 *
 *	(A) If the Creation timestamp has changed on a RW volume or regressed
 *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	    RO volume, we assume "vos release" happened and try to increment
 *	    ->cb_ro_snapshot.
 *
 *      (B) If the Update timestamp has regressed, we try to increment
 *	    ->cb_scrub.
 *
 *      Note that in both of these cases, we only do the increment if we can
 *      cmpxchg the value of the timestamp from the value we noted before the
 *      op.  This tries to prevent parallel ops from fighting one another.
 *
 *	volume->cb_v_check is then set to ->cb_v_break.
 *
 *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *	parsed and used to set the promise in ->cb_expires_at for the vnode,
 *	the volume and the volume's server record.
 *
 *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *      the vnode.
 */
116 
117 /*
118  * Check the validity of a vnode/inode and its parent volume.
119  */
afs_check_validity(const struct afs_vnode * vnode)120 bool afs_check_validity(const struct afs_vnode *vnode)
121 {
122 	const struct afs_volume *volume = vnode->volume;
123 	time64_t deadline = ktime_get_real_seconds() + 10;
124 
125 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
126 		return true;
127 
128 	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
129 	    atomic64_read(&vnode->cb_expires_at)  <= deadline ||
130 	    volume->cb_expires_at <= deadline ||
131 	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
132 	    vnode->cb_scrub	  != atomic_read(&volume->cb_scrub) ||
133 	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
134 		_debug("inval");
135 		return false;
136 	}
137 
138 	return true;
139 }
140 
141 /*
142  * See if the server we've just talked to is currently excluded.
143  */
__afs_is_server_excluded(struct afs_operation * op,struct afs_volume * volume)144 static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
145 {
146 	const struct afs_server_entry *se;
147 	const struct afs_server_list *slist;
148 	bool is_excluded = true;
149 	int i;
150 
151 	rcu_read_lock();
152 
153 	slist = rcu_dereference(volume->servers);
154 	for (i = 0; i < slist->nr_servers; i++) {
155 		se = &slist->servers[i];
156 		if (op->server == se->server) {
157 			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
158 			break;
159 		}
160 	}
161 
162 	rcu_read_unlock();
163 	return is_excluded;
164 }
165 
166 /*
167  * Update the volume's server list when the creation time changes and see if
168  * the server we've just talked to is currently excluded.
169  */
afs_is_server_excluded(struct afs_operation * op,struct afs_volume * volume)170 static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
171 {
172 	int ret;
173 
174 	if (__afs_is_server_excluded(op, volume))
175 		return 1;
176 
177 	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
178 	ret = afs_check_volume_status(op->volume, op);
179 	if (ret < 0)
180 		return ret;
181 
182 	return __afs_is_server_excluded(op, volume);
183 }
184 
185 /*
186  * Handle a change to the volume creation time in the VolSync record.
187  */
afs_update_volume_creation_time(struct afs_operation * op,struct afs_volume * volume)188 static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
189 {
190 	unsigned int snap;
191 	time64_t cur = volume->creation_time;
192 	time64_t old = op->pre_volsync.creation;
193 	time64_t new = op->volsync.creation;
194 	int ret;
195 
196 	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
197 
198 	if (cur == TIME64_MIN) {
199 		volume->creation_time = new;
200 		return 0;
201 	}
202 
203 	if (new == cur)
204 		return 0;
205 
206 	/* Try to advance the creation timestamp from what we had before the
207 	 * operation to what we got back from the server.  This should
208 	 * hopefully ensure that in a race between multiple operations only one
209 	 * of them will do this.
210 	 */
211 	if (cur != old)
212 		return 0;
213 
214 	/* If the creation time changes in an unexpected way, we need to scrub
215 	 * our caches.  For a RW vol, this will only change if the volume is
216 	 * restored from a backup; for a RO/Backup vol, this will advance when
217 	 * the volume is updated to a new snapshot (eg. "vos release").
218 	 */
219 	if (volume->type == AFSVL_RWVOL)
220 		goto regressed;
221 	if (volume->type == AFSVL_BACKVOL) {
222 		if (new < old)
223 			goto regressed;
224 		goto advance;
225 	}
226 
227 	/* We have an RO volume, we need to query the VL server and look at the
228 	 * server flags to see if RW->RO replication is in progress.
229 	 */
230 	ret = afs_is_server_excluded(op, volume);
231 	if (ret < 0)
232 		return ret;
233 	if (ret > 0) {
234 		snap = atomic_read(&volume->cb_ro_snapshot);
235 		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
236 		return ret;
237 	}
238 
239 advance:
240 	snap = atomic_inc_return(&volume->cb_ro_snapshot);
241 	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
242 	volume->creation_time = new;
243 	return 0;
244 
245 regressed:
246 	atomic_inc(&volume->cb_scrub);
247 	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
248 	volume->creation_time = new;
249 	return 0;
250 }
251 
252 /*
253  * Handle a change to the volume update time in the VolSync record.
254  */
afs_update_volume_update_time(struct afs_operation * op,struct afs_volume * volume)255 static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
256 {
257 	enum afs_cb_break_reason reason = afs_cb_break_no_break;
258 	time64_t cur = volume->update_time;
259 	time64_t old = op->pre_volsync.update;
260 	time64_t new = op->volsync.update;
261 
262 	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
263 
264 	if (cur == TIME64_MIN) {
265 		volume->update_time = new;
266 		return;
267 	}
268 
269 	if (new == cur)
270 		return;
271 
272 	/* If the volume update time changes in an unexpected way, we need to
273 	 * scrub our caches.  For a RW vol, this will advance on every
274 	 * modification op; for a RO/Backup vol, this will advance when the
275 	 * volume is updated to a new snapshot (eg. "vos release").
276 	 */
277 	if (new < old)
278 		reason = afs_cb_break_for_update_regress;
279 
280 	/* Try to advance the update timestamp from what we had before the
281 	 * operation to what we got back from the server.  This should
282 	 * hopefully ensure that in a race between multiple operations only one
283 	 * of them will do this.
284 	 */
285 	if (cur == old) {
286 		if (reason == afs_cb_break_for_update_regress) {
287 			atomic_inc(&volume->cb_scrub);
288 			trace_afs_cb_v_break(volume->vid, 0, reason);
289 		}
290 		volume->update_time = new;
291 	}
292 }
293 
afs_update_volume_times(struct afs_operation * op,struct afs_volume * volume)294 static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
295 {
296 	int ret = 0;
297 
298 	if (likely(op->volsync.creation == volume->creation_time &&
299 		   op->volsync.update == volume->update_time))
300 		return 0;
301 
302 	mutex_lock(&volume->volsync_lock);
303 	if (op->volsync.creation != volume->creation_time) {
304 		ret = afs_update_volume_creation_time(op, volume);
305 		if (ret < 0)
306 			goto out;
307 	}
308 	if (op->volsync.update != volume->update_time)
309 		afs_update_volume_update_time(op, volume);
310 out:
311 	mutex_unlock(&volume->volsync_lock);
312 	return ret;
313 }
314 
315 /*
316  * Update the state of a volume, including recording the expiration time of the
317  * callback promise.  Returns 1 to redo the operation from the start.
318  */
afs_update_volume_state(struct afs_operation * op)319 int afs_update_volume_state(struct afs_operation *op)
320 {
321 	struct afs_server_list *slist = op->server_list;
322 	struct afs_server_entry *se = &slist->servers[op->server_index];
323 	struct afs_callback *cb = &op->file[0].scb.callback;
324 	struct afs_volume *volume = op->volume;
325 	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
326 	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
327 	int ret;
328 
329 	_enter("%llx", op->volume->vid);
330 
331 	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
332 		ret = afs_update_volume_times(op, volume);
333 		if (ret != 0) {
334 			_leave(" = %d", ret);
335 			return ret;
336 		}
337 	}
338 
339 	if (op->cb_v_break == cb_v_break &&
340 	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
341 		time64_t expires_at = cb->expires_at;
342 
343 		if (!op->file[0].scb.have_cb)
344 			expires_at = op->file[1].scb.callback.expires_at;
345 
346 		se->cb_expires_at = expires_at;
347 		volume->cb_expires_at = expires_at;
348 	}
349 	if (cb_v_check < op->cb_v_break)
350 		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
351 	return 0;
352 }
353 
354 /*
355  * mark the data attached to an inode as obsolete due to a write on the server
356  * - might also want to ditch all the outstanding writes and dirty pages
357  */
afs_zap_data(struct afs_vnode * vnode)358 static void afs_zap_data(struct afs_vnode *vnode)
359 {
360 	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
361 
362 	afs_invalidate_cache(vnode, 0);
363 
364 	/* nuke all the non-dirty pages that aren't locked, mapped or being
365 	 * written back in a regular file and completely discard the pages in a
366 	 * directory or symlink */
367 	if (S_ISREG(vnode->netfs.inode.i_mode))
368 		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
369 	else
370 		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
371 }
372 
373 /*
374  * validate a vnode/inode
375  * - there are several things we need to check
376  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
377  *     symlink)
378  *   - parent dir metadata changed (security changes)
379  *   - dentry data changed (write, truncate)
380  *   - dentry metadata changed (security changes)
381  */
afs_validate(struct afs_vnode * vnode,struct key * key)382 int afs_validate(struct afs_vnode *vnode, struct key *key)
383 {
384 	struct afs_volume *volume = vnode->volume;
385 	unsigned int cb_ro_snapshot, cb_scrub;
386 	time64_t deadline = ktime_get_real_seconds() + 10;
387 	bool zap = false, locked_vol = false;
388 	int ret;
389 
390 	_enter("{v={%llx:%llu} fl=%lx},%x",
391 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
392 	       key_serial(key));
393 
394 	if (afs_check_validity(vnode))
395 		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
396 
397 	ret = down_write_killable(&vnode->validate_lock);
398 	if (ret < 0)
399 		goto error;
400 
401 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
402 		ret = -ESTALE;
403 		goto error_unlock;
404 	}
405 
406 	/* Validate a volume after the v_break has changed or the volume
407 	 * callback expired.  We only want to do this once per volume per
408 	 * v_break change.  The actual work will be done when parsing the
409 	 * status fetch reply.
410 	 */
411 	if (volume->cb_expires_at <= deadline ||
412 	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
413 		ret = mutex_lock_interruptible(&volume->cb_check_lock);
414 		if (ret < 0)
415 			goto error_unlock;
416 		locked_vol = true;
417 	}
418 
419 	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
420 	cb_scrub = atomic_read(&volume->cb_scrub);
421 	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
422 	    vnode->cb_scrub	  != cb_scrub)
423 		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
424 
425 	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
426 	    vnode->cb_scrub	  != cb_scrub ||
427 	    volume->cb_expires_at <= deadline ||
428 	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
429 	    atomic64_read(&vnode->cb_expires_at) <= deadline
430 	    ) {
431 		ret = afs_fetch_status(vnode, key, false, NULL);
432 		if (ret < 0) {
433 			if (ret == -ENOENT) {
434 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
435 				ret = -ESTALE;
436 			}
437 			goto error_unlock;
438 		}
439 
440 		_debug("new promise [fl=%lx]", vnode->flags);
441 	}
442 
443 	/* We can drop the volume lock now as. */
444 	if (locked_vol) {
445 		mutex_unlock(&volume->cb_check_lock);
446 		locked_vol = false;
447 	}
448 
449 	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
450 	cb_scrub = atomic_read(&volume->cb_scrub);
451 	_debug("vnode inval %x==%x %x==%x",
452 	       vnode->cb_ro_snapshot, cb_ro_snapshot,
453 	       vnode->cb_scrub, cb_scrub);
454 	if (vnode->cb_scrub != cb_scrub)
455 		zap = true;
456 	vnode->cb_ro_snapshot = cb_ro_snapshot;
457 	vnode->cb_scrub = cb_scrub;
458 
459 	/* if the vnode's data version number changed then its contents are
460 	 * different */
461 	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
462 	if (zap)
463 		afs_zap_data(vnode);
464 	up_write(&vnode->validate_lock);
465 	_leave(" = 0");
466 	return 0;
467 
468 error_unlock:
469 	if (locked_vol)
470 		mutex_unlock(&volume->cb_check_lock);
471 	up_write(&vnode->validate_lock);
472 error:
473 	_leave(" = %d", ret);
474 	return ret;
475 }
476