xref: /netbsd-src/sys/kern/vfs_trans.c (revision e94a5d02693120d4ad9d909e488894e9fcf0eb76)
1 /*	$NetBSD: vfs_trans.c,v 1.73 2024/12/07 02:27:38 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Juergen Hannken-Illjes.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.73 2024/12/07 02:27:38 riastradh Exp $");
34 
35 /*
36  * File system transaction operations.
37  */
38 
39 #ifdef _KERNEL_OPT
40 #include "opt_ddb.h"
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 
46 #include <sys/atomic.h>
47 #include <sys/buf.h>
48 #include <sys/fstrans.h>
49 #include <sys/hash.h>
50 #include <sys/kmem.h>
51 #include <sys/mount.h>
52 #include <sys/pool.h>
53 #include <sys/proc.h>
54 #include <sys/pserialize.h>
55 #include <sys/sdt.h>
56 #include <sys/systm.h>
57 #include <sys/vnode.h>
58 
59 #include <miscfs/deadfs/deadfs.h>
60 #include <miscfs/specfs/specdev.h>
61 
/* Table size handed to hashinit() for the mount info hash table. */
#define FSTRANS_MOUNT_HASHSIZE	32

/*
 * Kind of transaction an lwp runs on a mount, see grant_lock().
 */
enum fstrans_lock_type {
	/* Granted while not suspended */
	FSTRANS_LAZY,
	/* Granted while not suspending */
	FSTRANS_SHARED
};
68 
69 struct fscow_handler {
70 	LIST_ENTRY(fscow_handler) ch_list;
71 	int (*ch_func)(void *, struct buf *, bool);
72 	void *ch_arg;
73 };
74 struct fstrans_lwp_info {
75 	struct fstrans_lwp_info *fli_succ;
76 	struct lwp *fli_self;
77 	struct mount *fli_mount;
78 	struct fstrans_lwp_info *fli_alias;
79 	struct fstrans_mount_info *fli_mountinfo;
80 	int fli_trans_cnt;
81 	int fli_alias_cnt;
82 	int fli_cow_cnt;
83 	enum fstrans_lock_type fli_lock_type;
84 	LIST_ENTRY(fstrans_lwp_info) fli_list;
85 };
86 struct fstrans_mount_info {
87 	enum fstrans_state fmi_state;
88 	unsigned int fmi_ref_cnt;
89 	bool fmi_gone;
90 	bool fmi_cow_change;
91 	SLIST_ENTRY(fstrans_mount_info) fmi_hash;
92 	LIST_HEAD(, fscow_handler) fmi_cow_handler;
93 	struct mount *fmi_mount;
94 	struct fstrans_mount_info *fmi_lower_info;
95 	struct lwp *fmi_owner;
96 };
97 SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
98 
99 static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
100     __cacheline_aligned;
101 static kmutex_t fstrans_lock		/* Fstrans big lock. */
102     __cacheline_aligned;
103 static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
104 static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
105 static pserialize_t fstrans_psz;	/* Pserialize state. */
106 static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
107 					/* List of all fstrans_lwp_info. */
108 static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
109 
110 static u_long fstrans_mount_hashmask;
111 static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
112 static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
113 
114 static inline uint32_t fstrans_mount_hash(struct mount *);
115 static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
116 static void fstrans_mount_dtor(struct fstrans_mount_info *);
117 static void fstrans_clear_lwp_info(void);
118 static inline struct fstrans_lwp_info *
119     fstrans_get_lwp_info(struct mount *, bool);
120 static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
121 static int fstrans_lwp_pcc(void *, void *, int);
122 static void fstrans_lwp_pcd(void *, void *);
123 static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
124 static bool grant_lock(const struct fstrans_mount_info *,
125     const enum fstrans_lock_type);
126 static bool state_change_done(const struct fstrans_mount_info *);
127 static bool cow_state_change_done(const struct fstrans_mount_info *);
128 static void cow_change_enter(struct fstrans_mount_info *);
129 static void cow_change_done(struct fstrans_mount_info *);
130 
131 /*
132  * Initialize.
133  */
134 void
135 fstrans_init(void)
136 {
137 
138 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
139 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
140 	cv_init(&fstrans_state_cv, "fstchg");
141 	cv_init(&fstrans_count_cv, "fstcnt");
142 	fstrans_psz = pserialize_create();
143 	LIST_INIT(&fstrans_fli_head);
144 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
145 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
146 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
147 	KASSERT(fstrans_lwp_cache != NULL);
148 	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
149 	    true, &fstrans_mount_hashmask);
150 }
151 
152 /*
153  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
154  * produces cache misses on MP.  Minimise by keeping free entries on list.
155  */
156 int
157 fstrans_lwp_pcc(void *arg, void *obj, int flags)
158 {
159 	struct fstrans_lwp_info *fli = obj;
160 
161 	memset(fli, 0, sizeof(*fli));
162 
163 	mutex_enter(&fstrans_lock);
164 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
165 	mutex_exit(&fstrans_lock);
166 
167 	return 0;
168 }
169 
170 /*
171  * pool_cache destructor
172  */
173 void
174 fstrans_lwp_pcd(void *arg, void *obj)
175 {
176 	struct fstrans_lwp_info *fli = obj;
177 
178 	mutex_enter(&fstrans_lock);
179 	LIST_REMOVE(fli, fli_list);
180 	mutex_exit(&fstrans_lock);
181 }
182 
183 /*
184  * Deallocate lwp state.
185  */
186 void
187 fstrans_lwp_dtor(lwp_t *l)
188 {
189 	struct fstrans_lwp_info *fli, *fli_next;
190 
191 	if (l->l_fstrans == NULL)
192 		return;
193 
194 	mutex_enter(&fstrans_lock);
195 	for (fli = l->l_fstrans; fli; fli = fli_next) {
196 		KASSERT(fli->fli_trans_cnt == 0);
197 		KASSERT(fli->fli_cow_cnt == 0);
198 		KASSERT(fli->fli_self == l);
199 		if (fli->fli_mount != NULL)
200 			fstrans_mount_dtor(fli->fli_mountinfo);
201 		fli_next = fli->fli_succ;
202 		fli->fli_alias_cnt = 0;
203 		fli->fli_mount = NULL;
204 		fli->fli_alias = NULL;
205 		fli->fli_mountinfo = NULL;
206 		fli->fli_self = NULL;
207 	}
208 	mutex_exit(&fstrans_lock);
209 
210 	for (fli = l->l_fstrans; fli; fli = fli_next) {
211 		fli_next = fli->fli_succ;
212 		pool_cache_put(fstrans_lwp_cache, fli);
213 	}
214 	l->l_fstrans = NULL;
215 }
216 
217 /*
218  * mount pointer to hash
219  */
220 static inline uint32_t
221 fstrans_mount_hash(struct mount *mp)
222 {
223 
224 	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
225 	    fstrans_mount_hashmask;
226 }
227 
228 /*
229  * retrieve fstrans_mount_info by mount or NULL
230  */
231 static inline struct fstrans_mount_info *
232 fstrans_mount_get(struct mount *mp)
233 {
234 	uint32_t indx;
235 	struct fstrans_mount_info *fmi, *fmi_lower;
236 
237 	KASSERT(mutex_owned(&fstrans_lock));
238 
239 	indx = fstrans_mount_hash(mp);
240 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
241 		if (fmi->fmi_mount == mp) {
242 			if (__predict_false(mp->mnt_lower != NULL &&
243 			    fmi->fmi_lower_info == NULL)) {
244 				/*
245 				 * Intern the lower/lowest mount into
246 				 * this mount info on first lookup.
247 				 */
248 				KASSERT(fmi->fmi_ref_cnt == 1);
249 
250 				fmi_lower = fstrans_mount_get(mp->mnt_lower);
251 				if (fmi_lower && fmi_lower->fmi_lower_info)
252 					fmi_lower = fmi_lower->fmi_lower_info;
253 				if (fmi_lower == NULL)
254 					return NULL;
255 				fmi->fmi_lower_info = fmi_lower;
256 				fmi->fmi_lower_info->fmi_ref_cnt += 1;
257 			}
258 			return fmi;
259 		}
260 	}
261 
262 	return NULL;
263 }
264 
265 /*
266  * Dereference mount state.
267  */
268 static void
269 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
270 {
271 
272 	KASSERT(mutex_owned(&fstrans_lock));
273 
274 	KASSERT(fmi != NULL);
275 	fmi->fmi_ref_cnt -= 1;
276 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
277 		return;
278 	}
279 
280 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
281 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
282 	KASSERT(fmi->fmi_owner == NULL);
283 
284 	if (fmi->fmi_lower_info)
285 		fstrans_mount_dtor(fmi->fmi_lower_info);
286 
287 	KASSERT(fstrans_gone_count > 0);
288 	fstrans_gone_count -= 1;
289 
290 	KASSERT(fmi->fmi_mount->mnt_lower == NULL);
291 
292 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
293 	kmem_free(fmi, sizeof(*fmi));
294 }
295 
296 /*
297  * Allocate mount state.
298  */
299 int
300 fstrans_mount(struct mount *mp)
301 {
302 	uint32_t indx;
303 	struct fstrans_mount_info *newfmi;
304 
305 	indx = fstrans_mount_hash(mp);
306 
307 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
308 	newfmi->fmi_state = FSTRANS_NORMAL;
309 	newfmi->fmi_ref_cnt = 1;
310 	newfmi->fmi_gone = false;
311 	LIST_INIT(&newfmi->fmi_cow_handler);
312 	newfmi->fmi_cow_change = false;
313 	newfmi->fmi_mount = mp;
314 	newfmi->fmi_lower_info = NULL;
315 	newfmi->fmi_owner = NULL;
316 
317 	mutex_enter(&fstrans_lock);
318 	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
319 	mutex_exit(&fstrans_lock);
320 
321 	return 0;
322 }
323 
324 /*
325  * Deallocate mount state.
326  */
327 void
328 fstrans_unmount(struct mount *mp)
329 {
330 	uint32_t indx;
331 	struct fstrans_mount_info *fmi;
332 
333 	indx = fstrans_mount_hash(mp);
334 
335 	mutex_enter(&fstrans_lock);
336 	fmi = fstrans_mount_get(mp);
337 	KASSERT(fmi != NULL);
338 	fmi->fmi_gone = true;
339 	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
340 	    fmi, fstrans_mount_info, fmi_hash);
341 	fstrans_gone_count += 1;
342 	fstrans_mount_dtor(fmi);
343 	mutex_exit(&fstrans_lock);
344 }
345 
346 /*
347  * Clear mount entries whose mount is gone.
348  */
349 static void
350 fstrans_clear_lwp_info(void)
351 {
352 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
353 
354 	/*
355 	 * Scan our list clearing entries whose mount is gone.
356 	 */
357 	mutex_enter(&fstrans_lock);
358 	for (p = &curlwp->l_fstrans; *p; ) {
359 		fli = *p;
360 		if (fli->fli_mount != NULL &&
361 		    fli->fli_mountinfo->fmi_gone &&
362 		    fli->fli_trans_cnt == 0 &&
363 		    fli->fli_cow_cnt == 0 &&
364 		    fli->fli_alias_cnt == 0) {
365 			*p = (*p)->fli_succ;
366 			fstrans_mount_dtor(fli->fli_mountinfo);
367 			if (fli->fli_alias) {
368 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
369 				fli->fli_alias->fli_alias_cnt--;
370 			}
371 			fli->fli_mount = NULL;
372 			fli->fli_alias = NULL;
373 			fli->fli_mountinfo = NULL;
374 			fli->fli_self = NULL;
375 			p = &curlwp->l_fstrans;
376 			fli->fli_succ = tofree;
377 			tofree = fli;
378 		} else {
379 			p = &(*p)->fli_succ;
380 		}
381 	}
382 #ifdef DIAGNOSTIC
383 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
384 		if (fli->fli_alias != NULL)
385 			KASSERT(fli->fli_alias->fli_self == curlwp);
386 #endif /* DIAGNOSTIC */
387 	mutex_exit(&fstrans_lock);
388 
389 	while (tofree != NULL) {
390 		fli = tofree;
391 		tofree = fli->fli_succ;
392 		pool_cache_put(fstrans_lwp_cache, fli);
393 	}
394 }
395 
396 /*
397  * Allocate and return per lwp info for this mount.
398  */
399 static struct fstrans_lwp_info *
400 fstrans_alloc_lwp_info(struct mount *mp)
401 {
402 	struct fstrans_lwp_info *fli, *fli_lower;
403 	struct fstrans_mount_info *fmi;
404 
405 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
406 		if (fli->fli_mount == mp)
407 			return fli;
408 	}
409 
410 	/*
411 	 * Lookup mount info and get lower mount per lwp info.
412 	 */
413 	mutex_enter(&fstrans_lock);
414 	fmi = fstrans_mount_get(mp);
415 	if (fmi == NULL) {
416 		mutex_exit(&fstrans_lock);
417 		return NULL;
418 	}
419 	fmi->fmi_ref_cnt += 1;
420 	mutex_exit(&fstrans_lock);
421 
422 	if (fmi->fmi_lower_info) {
423 		fli_lower =
424 		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
425 		if (fli_lower == NULL) {
426 			mutex_enter(&fstrans_lock);
427 			fstrans_mount_dtor(fmi);
428 			mutex_exit(&fstrans_lock);
429 
430 			return NULL;
431 		}
432 	} else {
433 		fli_lower = NULL;
434 	}
435 
436 	/*
437 	 * Allocate a new entry.
438 	 */
439 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
440 	KASSERT(fli->fli_trans_cnt == 0);
441 	KASSERT(fli->fli_cow_cnt == 0);
442 	KASSERT(fli->fli_alias_cnt == 0);
443 	KASSERT(fli->fli_mount == NULL);
444 	KASSERT(fli->fli_alias == NULL);
445 	KASSERT(fli->fli_mountinfo == NULL);
446 	KASSERT(fli->fli_self == NULL);
447 
448 	/*
449 	 * Attach the mount info and alias.
450 	 */
451 
452 	fli->fli_self = curlwp;
453 	fli->fli_mount = mp;
454 	fli->fli_mountinfo = fmi;
455 
456 	fli->fli_succ = curlwp->l_fstrans;
457 	curlwp->l_fstrans = fli;
458 
459 	if (fli_lower) {
460 		fli->fli_alias = fli_lower;
461 		fli->fli_alias->fli_alias_cnt++;
462 		fli = fli->fli_alias;
463 	}
464 
465 	return fli;
466 }
467 
468 /*
469  * Retrieve the per lwp info for this mount allocating if necessary.
470  */
471 static inline struct fstrans_lwp_info *
472 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
473 {
474 	struct fstrans_lwp_info *fli;
475 
476 	/*
477 	 * Scan our list for a match.
478 	 */
479 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
480 		if (fli->fli_mount == mp) {
481 			KASSERT(mp->mnt_lower == NULL ||
482 			    fli->fli_alias != NULL);
483 			if (fli->fli_alias != NULL)
484 				fli = fli->fli_alias;
485 			break;
486 		}
487 	}
488 
489 	if (do_alloc) {
490 		if (__predict_false(fli == NULL))
491 			fli = fstrans_alloc_lwp_info(mp);
492 	}
493 
494 	return fli;
495 }
496 
497 /*
498  * Check if this lock type is granted at this state.
499  */
500 static bool
501 grant_lock(const struct fstrans_mount_info *fmi,
502     const enum fstrans_lock_type type)
503 {
504 
505 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
506 		return true;
507 	if (fmi->fmi_owner == curlwp)
508 		return true;
509 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
510 		return true;
511 
512 	return false;
513 }
514 
515 /*
516  * Start a transaction.  If this thread already has a transaction on this
517  * file system increment the reference counter.
518  */
519 static inline int
520 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
521 {
522 	int s;
523 	struct fstrans_lwp_info *fli;
524 	struct fstrans_mount_info *fmi;
525 
526 	ASSERT_SLEEPABLE();
527 
528 	fli = fstrans_get_lwp_info(mp, true);
529 	if (fli == NULL)
530 		return 0;
531 	fmi = fli->fli_mountinfo;
532 
533 	if (fli->fli_trans_cnt > 0) {
534 		fli->fli_trans_cnt += 1;
535 
536 		return 0;
537 	}
538 
539 	s = pserialize_read_enter();
540 	if (__predict_true(grant_lock(fmi, lock_type))) {
541 		fli->fli_trans_cnt = 1;
542 		fli->fli_lock_type = lock_type;
543 		pserialize_read_exit(s);
544 
545 		return 0;
546 	}
547 	pserialize_read_exit(s);
548 
549 	if (! wait)
550 		return SET_ERROR(EBUSY);
551 
552 	mutex_enter(&fstrans_lock);
553 	while (! grant_lock(fmi, lock_type))
554 		cv_wait(&fstrans_state_cv, &fstrans_lock);
555 	fli->fli_trans_cnt = 1;
556 	fli->fli_lock_type = lock_type;
557 	mutex_exit(&fstrans_lock);
558 
559 	return 0;
560 }
561 
562 void
563 fstrans_start(struct mount *mp)
564 {
565 	int error __diagused;
566 
567 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
568 	KASSERT(error == 0);
569 }
570 
571 int
572 fstrans_start_nowait(struct mount *mp)
573 {
574 
575 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
576 }
577 
578 void
579 fstrans_start_lazy(struct mount *mp)
580 {
581 	int error __diagused;
582 
583 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
584 	KASSERT(error == 0);
585 }
586 
587 /*
588  * Finish a transaction.
589  */
590 void
591 fstrans_done(struct mount *mp)
592 {
593 	int s;
594 	struct fstrans_lwp_info *fli;
595 	struct fstrans_mount_info *fmi;
596 
597 	fli = fstrans_get_lwp_info(mp, false);
598 	if (fli == NULL)
599 		return;
600 	fmi = fli->fli_mountinfo;
601 	KASSERT(fli->fli_trans_cnt > 0);
602 
603 	if (fli->fli_trans_cnt > 1) {
604 		fli->fli_trans_cnt -= 1;
605 
606 		return;
607 	}
608 
609 	if (__predict_false(fstrans_gone_count > 0))
610 		fstrans_clear_lwp_info();
611 
612 	s = pserialize_read_enter();
613 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
614 		fli->fli_trans_cnt = 0;
615 		pserialize_read_exit(s);
616 
617 		return;
618 	}
619 	pserialize_read_exit(s);
620 
621 	mutex_enter(&fstrans_lock);
622 	fli->fli_trans_cnt = 0;
623 	cv_signal(&fstrans_count_cv);
624 	mutex_exit(&fstrans_lock);
625 }
626 
627 /*
628  * Check if we hold an lock.
629  */
630 int
631 fstrans_held(struct mount *mp)
632 {
633 	struct fstrans_lwp_info *fli;
634 	struct fstrans_mount_info *fmi;
635 
636 	KASSERT(mp != dead_rootmount);
637 
638 	fli = fstrans_get_lwp_info(mp, false);
639 	if (fli == NULL)
640 		return 0;
641 	fmi = fli->fli_mountinfo;
642 
643 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
644 }
645 
646 /*
647  * Check if this thread has an exclusive lock.
648  */
649 int
650 fstrans_is_owner(struct mount *mp)
651 {
652 	struct fstrans_lwp_info *fli;
653 	struct fstrans_mount_info *fmi;
654 
655 	KASSERT(mp != dead_rootmount);
656 
657 	fli = fstrans_get_lwp_info(mp, false);
658 	if (fli == NULL)
659 		return 0;
660 	fmi = fli->fli_mountinfo;
661 
662 	return (fmi->fmi_owner == curlwp);
663 }
664 
665 /*
666  * True, if no thread is in a transaction not granted at the current state.
667  */
668 static bool
669 state_change_done(const struct fstrans_mount_info *fmi)
670 {
671 	struct fstrans_lwp_info *fli;
672 
673 	KASSERT(mutex_owned(&fstrans_lock));
674 
675 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
676 		if (fli->fli_mountinfo != fmi)
677 			continue;
678 		if (fli->fli_trans_cnt == 0)
679 			continue;
680 		if (fli->fli_self == curlwp)
681 			continue;
682 		if (grant_lock(fmi, fli->fli_lock_type))
683 			continue;
684 
685 		return false;
686 	}
687 
688 	return true;
689 }
690 
691 /*
692  * Set new file system state.
693  */
694 int
695 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
696 {
697 	int error;
698 	enum fstrans_state old_state;
699 	struct fstrans_lwp_info *fli;
700 	struct fstrans_mount_info *fmi;
701 
702 	KASSERT(mp != dead_rootmount);
703 
704 	fli = fstrans_get_lwp_info(mp, true);
705 	if (fli == NULL)
706 		return SET_ERROR(ENOENT);
707 	fmi = fli->fli_mountinfo;
708 	old_state = fmi->fmi_state;
709 	if (old_state == new_state)
710 		return 0;
711 
712 	mutex_enter(&fstrans_lock);
713 	fmi->fmi_state = new_state;
714 	pserialize_perform(fstrans_psz);
715 
716 	/*
717 	 * All threads see the new state now.
718 	 * Wait for transactions invalid at this state to leave.
719 	 */
720 	error = 0;
721 	while (! state_change_done(fmi)) {
722 		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
723 		if (error) {
724 			new_state = fmi->fmi_state = FSTRANS_NORMAL;
725 			break;
726 		}
727 	}
728 	if (old_state != new_state) {
729 		if (old_state == FSTRANS_NORMAL) {
730 			KASSERT(fmi->fmi_owner == NULL);
731 			fmi->fmi_owner = curlwp;
732 		}
733 		if (new_state == FSTRANS_NORMAL) {
734 			KASSERT(fmi->fmi_owner == curlwp);
735 			fmi->fmi_owner = NULL;
736 		}
737 	}
738 	cv_broadcast(&fstrans_state_cv);
739 	mutex_exit(&fstrans_lock);
740 
741 	return error;
742 }
743 
744 /*
745  * Get current file system state.
746  */
747 enum fstrans_state
748 fstrans_getstate(struct mount *mp)
749 {
750 	struct fstrans_lwp_info *fli;
751 	struct fstrans_mount_info *fmi;
752 
753 	KASSERT(mp != dead_rootmount);
754 
755 	fli = fstrans_get_lwp_info(mp, true);
756 	KASSERT(fli != NULL);
757 	fmi = fli->fli_mountinfo;
758 
759 	return fmi->fmi_state;
760 }
761 
762 /*
763  * Request a filesystem to suspend all operations.
764  */
765 int
766 vfs_suspend(struct mount *mp, int nowait)
767 {
768 	struct fstrans_lwp_info *fli;
769 	int error;
770 
771 	if (mp == dead_rootmount)
772 		return SET_ERROR(EOPNOTSUPP);
773 
774 	fli = fstrans_get_lwp_info(mp, true);
775 	if (fli == NULL)
776 		return SET_ERROR(ENOENT);
777 
778 	if (nowait) {
779 		if (!mutex_tryenter(&vfs_suspend_lock))
780 			return SET_ERROR(EWOULDBLOCK);
781 	} else
782 		mutex_enter(&vfs_suspend_lock);
783 
784 	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
785 		mutex_exit(&vfs_suspend_lock);
786 		return error;
787 	}
788 
789 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
790 		vfs_resume(mp);
791 		return SET_ERROR(ENOENT);
792 	}
793 
794 	return 0;
795 }
796 
797 /*
798  * Request a filesystem to resume all operations.
799  */
800 void
801 vfs_resume(struct mount *mp)
802 {
803 	struct fstrans_lwp_info *fli;
804 
805 	KASSERT(mp != dead_rootmount);
806 
807 	fli = fstrans_get_lwp_info(mp, false);
808 	mp = fli->fli_mount;
809 
810 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
811 	mutex_exit(&vfs_suspend_lock);
812 }
813 
814 /*
815  * True, if no thread is running a cow handler.
816  */
817 static bool
818 cow_state_change_done(const struct fstrans_mount_info *fmi)
819 {
820 	struct fstrans_lwp_info *fli;
821 
822 	KASSERT(mutex_owned(&fstrans_lock));
823 	KASSERT(fmi->fmi_cow_change);
824 
825 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
826 		if (fli->fli_mount != fmi->fmi_mount)
827 			continue;
828 		if (fli->fli_cow_cnt == 0)
829 			continue;
830 
831 		return false;
832 	}
833 
834 	return true;
835 }
836 
837 /*
838  * Prepare for changing this mounts cow list.
839  * Returns with fstrans_lock locked.
840  */
841 static void
842 cow_change_enter(struct fstrans_mount_info *fmi)
843 {
844 
845 	mutex_enter(&fstrans_lock);
846 
847 	/*
848 	 * Wait for other threads changing the list.
849 	 */
850 	while (fmi->fmi_cow_change)
851 		cv_wait(&fstrans_state_cv, &fstrans_lock);
852 
853 	/*
854 	 * Wait until all threads are aware of a state change.
855 	 */
856 	fmi->fmi_cow_change = true;
857 	pserialize_perform(fstrans_psz);
858 
859 	while (! cow_state_change_done(fmi))
860 		cv_wait(&fstrans_count_cv, &fstrans_lock);
861 }
862 
863 /*
864  * Done changing this mounts cow list.
865  */
866 static void
867 cow_change_done(struct fstrans_mount_info *fmi)
868 {
869 
870 	KASSERT(mutex_owned(&fstrans_lock));
871 
872 	fmi->fmi_cow_change = false;
873 	pserialize_perform(fstrans_psz);
874 
875 	cv_broadcast(&fstrans_state_cv);
876 
877 	mutex_exit(&fstrans_lock);
878 }
879 
880 /*
881  * Add a handler to this mount.
882  */
883 int
884 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
885     void *arg)
886 {
887 	struct fstrans_mount_info *fmi;
888 	struct fscow_handler *newch;
889 
890 	KASSERT(mp != dead_rootmount);
891 
892 	mutex_enter(&fstrans_lock);
893 	fmi = fstrans_mount_get(mp);
894 	KASSERT(fmi != NULL);
895 	fmi->fmi_ref_cnt += 1;
896 	mutex_exit(&fstrans_lock);
897 
898 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
899 	newch->ch_func = func;
900 	newch->ch_arg = arg;
901 
902 	cow_change_enter(fmi);
903 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
904 	cow_change_done(fmi);
905 
906 	return 0;
907 }
908 
909 /*
910  * Remove a handler from this mount.
911  */
912 int
913 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
914     void *arg)
915 {
916 	struct fstrans_mount_info *fmi;
917 	struct fscow_handler *hp = NULL;
918 
919 	KASSERT(mp != dead_rootmount);
920 
921 	mutex_enter(&fstrans_lock);
922 	fmi = fstrans_mount_get(mp);
923 	KASSERT(fmi != NULL);
924 	mutex_exit(&fstrans_lock);
925 
926 	cow_change_enter(fmi);
927 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
928 		if (hp->ch_func == func && hp->ch_arg == arg)
929 			break;
930 	if (hp != NULL) {
931 		LIST_REMOVE(hp, ch_list);
932 		kmem_free(hp, sizeof(*hp));
933 	}
934 	fstrans_mount_dtor(fmi);
935 	cow_change_done(fmi);
936 
937 	return hp ? 0 : SET_ERROR(EINVAL);
938 }
939 
940 /*
941  * Check for need to copy block that is about to be written.
942  */
943 int
944 fscow_run(struct buf *bp, bool data_valid)
945 {
946 	int error, s;
947 	struct mount *mp;
948 	struct fstrans_lwp_info *fli;
949 	struct fstrans_mount_info *fmi;
950 	struct fscow_handler *hp;
951 
952 	/*
953 	 * First check if we need run the copy-on-write handler.
954 	 */
955 	if ((bp->b_flags & B_COWDONE))
956 		return 0;
957 	if (bp->b_vp == NULL) {
958 		bp->b_flags |= B_COWDONE;
959 		return 0;
960 	}
961 	if (bp->b_vp->v_type == VBLK)
962 		mp = spec_node_getmountedfs(bp->b_vp);
963 	else
964 		mp = bp->b_vp->v_mount;
965 	if (mp == NULL || mp == dead_rootmount) {
966 		bp->b_flags |= B_COWDONE;
967 		return 0;
968 	}
969 
970 	fli = fstrans_get_lwp_info(mp, true);
971 	KASSERT(fli != NULL);
972 	fmi = fli->fli_mountinfo;
973 
974 	/*
975 	 * On non-recursed run check if other threads
976 	 * want to change the list.
977 	 */
978 	if (fli->fli_cow_cnt == 0) {
979 		s = pserialize_read_enter();
980 		if (__predict_false(fmi->fmi_cow_change)) {
981 			pserialize_read_exit(s);
982 			mutex_enter(&fstrans_lock);
983 			while (fmi->fmi_cow_change)
984 				cv_wait(&fstrans_state_cv, &fstrans_lock);
985 			fli->fli_cow_cnt = 1;
986 			mutex_exit(&fstrans_lock);
987 		} else {
988 			fli->fli_cow_cnt = 1;
989 			pserialize_read_exit(s);
990 		}
991 	} else
992 		fli->fli_cow_cnt += 1;
993 
994 	/*
995 	 * Run all copy-on-write handlers, stop on error.
996 	 */
997 	error = 0;
998 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
999 		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
1000 			break;
1001 	if (error == 0)
1002 		bp->b_flags |= B_COWDONE;
1003 
1004 	/*
1005 	 * Check if other threads want to change the list.
1006 	 */
1007 	if (fli->fli_cow_cnt > 1) {
1008 		fli->fli_cow_cnt -= 1;
1009 	} else {
1010 		s = pserialize_read_enter();
1011 		if (__predict_false(fmi->fmi_cow_change)) {
1012 			pserialize_read_exit(s);
1013 			mutex_enter(&fstrans_lock);
1014 			fli->fli_cow_cnt = 0;
1015 			cv_signal(&fstrans_count_cv);
1016 			mutex_exit(&fstrans_lock);
1017 		} else {
1018 			fli->fli_cow_cnt = 0;
1019 			pserialize_read_exit(s);
1020 		}
1021 	}
1022 
1023 	return error;
1024 }
1025 
1026 #if defined(DDB)
1027 void fstrans_dump(int);
1028 
1029 static void
1030 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
1031 {
1032 	char prefix[9];
1033 	struct fstrans_lwp_info *fli;
1034 
1035 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1036 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1037 		if (fli->fli_self != l)
1038 			continue;
1039 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1040 			if (! verbose)
1041 				continue;
1042 		}
1043 		printf("%-8s", prefix);
1044 		if (verbose)
1045 			printf(" @%p", fli);
1046 		if (fli->fli_mount == dead_rootmount)
1047 			printf(" <dead>");
1048 		else if (fli->fli_mount != NULL)
1049 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1050 		else
1051 			printf(" NULL");
1052 		if (fli->fli_alias != NULL) {
1053 			struct mount *amp = fli->fli_alias->fli_mount;
1054 
1055 			printf(" alias");
1056 			if (verbose)
1057 				printf(" @%p", fli->fli_alias);
1058 			if (amp == NULL)
1059 				printf(" NULL");
1060 			else
1061 				printf(" (%s)", amp->mnt_stat.f_mntonname);
1062 		}
1063 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1064 			printf(" gone");
1065 		if (fli->fli_trans_cnt == 0) {
1066 			printf(" -");
1067 		} else {
1068 			switch (fli->fli_lock_type) {
1069 			case FSTRANS_LAZY:
1070 				printf(" lazy");
1071 				break;
1072 			case FSTRANS_SHARED:
1073 				printf(" shared");
1074 				break;
1075 			default:
1076 				printf(" %#x", fli->fli_lock_type);
1077 				break;
1078 			}
1079 		}
1080 		printf(" %d cow %d alias %d\n",
1081 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1082 		prefix[0] = '\0';
1083 	}
1084 }
1085 
1086 static void
1087 fstrans_print_mount(struct mount *mp, int verbose)
1088 {
1089 	uint32_t indx;
1090 	struct fstrans_mount_info *fmi;
1091 
1092 	indx = fstrans_mount_hash(mp);
1093 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
1094 		if (fmi->fmi_mount == mp)
1095 			break;
1096 
1097 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1098 		return;
1099 
1100 	printf("%-16s ", mp->mnt_stat.f_mntonname);
1101 	if (fmi == NULL) {
1102 		printf("(null)\n");
1103 		return;
1104 	}
1105 	printf("owner %p ", fmi->fmi_owner);
1106 	switch (fmi->fmi_state) {
1107 	case FSTRANS_NORMAL:
1108 		printf("state normal\n");
1109 		break;
1110 	case FSTRANS_SUSPENDING:
1111 		printf("state suspending\n");
1112 		break;
1113 	case FSTRANS_SUSPENDED:
1114 		printf("state suspended\n");
1115 		break;
1116 	default:
1117 		printf("state %#x\n", fmi->fmi_state);
1118 		break;
1119 	}
1120 }
1121 
1122 void
1123 fstrans_dump(int full)
1124 {
1125 	const struct proclist_desc *pd;
1126 	struct proc *p;
1127 	struct lwp *l;
1128 	struct mount *mp;
1129 
1130 	printf("Fstrans locks by lwp:\n");
1131 	for (pd = proclists; pd->pd_list != NULL; pd++)
1132 		PROCLIST_FOREACH(p, pd->pd_list)
1133 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1134 				fstrans_print_lwp(p, l, full == 1);
1135 
1136 	printf("Fstrans state by mount:\n");
1137 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1138 		fstrans_print_mount(mp, full == 1);
1139 }
1140 #endif /* defined(DDB) */
1141