xref: /netbsd-src/sys/kern/vfs_trans.c (revision 5f2f42719cd62ff11fd913b40b7ce19f07c4fd25)
1 /*	$NetBSD: vfs_trans.c,v 1.68 2022/08/22 09:13:08 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Juergen Hannken-Illjes.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.68 2022/08/22 09:13:08 hannken Exp $");
34 
35 /*
36  * File system transaction operations.
37  */
38 
39 #ifdef _KERNEL_OPT
40 #include "opt_ddb.h"
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/atomic.h>
46 #include <sys/buf.h>
47 #include <sys/hash.h>
48 #include <sys/kmem.h>
49 #include <sys/mount.h>
50 #include <sys/pserialize.h>
51 #include <sys/vnode.h>
52 #include <sys/fstrans.h>
53 #include <sys/proc.h>
54 #include <sys/pool.h>
55 
56 #include <miscfs/specfs/specdev.h>
57 
58 #define FSTRANS_MOUNT_HASHSIZE	32
59 
/*
 * Kind of transaction lock requested by an lwp.  grant_lock() decides
 * in which file system states each kind is granted.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY,			/* Granted while not suspended */
	FSTRANS_SHARED			/* Granted while not suspending */
};
64 
/*
 * One registered copy-on-write handler.  Entries live on the
 * fmi_cow_handler list of a struct fstrans_mount_info.
 */
struct fscow_handler {
	LIST_ENTRY(fscow_handler) ch_list;	/* Link on fmi_cow_handler. */
	int (*ch_func)(void *, struct buf *, bool);
						/* Callback run by fscow_run(). */
	void *ch_arg;				/* First argument to ch_func. */
};
/*
 * Per lwp, per mount transaction state.  Entries are chained through
 * fli_succ on the owning lwp's l_fstrans list and through fli_list on
 * the global fstrans_fli_head list.
 */
struct fstrans_lwp_info {
	struct fstrans_lwp_info *fli_succ;	/* Next entry of this lwp. */
	struct lwp *fli_self;			/* Owning lwp, NULL if unused. */
	struct mount *fli_mount;		/* Mount, NULL if unused. */
	struct fstrans_lwp_info *fli_alias;	/* Entry of the lower mount
						   used in place of this one,
						   or NULL. */
	struct fstrans_mount_info *fli_mountinfo; /* Referenced mount info. */
	int fli_trans_cnt;			/* Transaction nesting depth. */
	int fli_alias_cnt;			/* Entries aliasing this one. */
	int fli_cow_cnt;			/* fscow_run() nesting depth. */
	enum fstrans_lock_type fli_lock_type;	/* Type of the transaction. */
	LIST_ENTRY(fstrans_lwp_info) fli_list;	/* Link on fstrans_fli_head. */
};
/*
 * Per mount transaction state.  Reference counted; released by
 * fstrans_mount_dtor() when the count drops to zero.
 */
struct fstrans_mount_info {
	enum fstrans_state fmi_state;		/* Current fstrans state. */
	unsigned int fmi_ref_cnt;		/* Reference count. */
	bool fmi_gone;				/* Set by fstrans_unmount(). */
	bool fmi_cow_change;			/* Cow list change in progress. */
	SLIST_ENTRY(fstrans_mount_info) fmi_hash; /* Hash chain link. */
	LIST_HEAD(, fscow_handler) fmi_cow_handler; /* Registered handlers. */
	struct mount *fmi_mount;		/* Mount described here. */
	struct fstrans_mount_info *fmi_lower_info; /* Info of the lower/lowest
						   mount if mnt_lower is set,
						   else NULL. */
	struct lwp *fmi_owner;			/* Lwp that changed the state
						   away from FSTRANS_NORMAL,
						   else NULL. */
};
/* Head type for one bucket of the mount info hash table. */
SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
94 
/*
 * File-scope state, initialised by fstrans_init().
 */
static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
    __cacheline_aligned;
static kmutex_t fstrans_lock		/* Fstrans big lock. */
    __cacheline_aligned;
static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
static pserialize_t fstrans_psz;	/* Pserialize state. */
static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
					/* List of all fstrans_lwp_info. */
static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */

static u_long fstrans_mount_hashmask;	/* Mask applied to the mount hash. */
static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
					/* Mount info hash table. */
static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
109 
/*
 * Forward declarations of the file-local helpers defined below.
 */
static inline uint32_t fstrans_mount_hash(struct mount *);
static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
static void fstrans_mount_dtor(struct fstrans_mount_info *);
static void fstrans_clear_lwp_info(void);
static inline struct fstrans_lwp_info *
    fstrans_get_lwp_info(struct mount *, bool);
static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
static int fstrans_lwp_pcc(void *, void *, int);
static void fstrans_lwp_pcd(void *, void *);
static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
static bool grant_lock(const struct fstrans_mount_info *,
    const enum fstrans_lock_type);
static bool state_change_done(const struct fstrans_mount_info *);
static bool cow_state_change_done(const struct fstrans_mount_info *);
static void cow_change_enter(struct fstrans_mount_info *);
static void cow_change_done(struct fstrans_mount_info *);

/* Defined outside this file; several entry points here reject it. */
extern struct mount *dead_rootmount;
128 
129 /*
130  * Initialize.
131  */
132 void
133 fstrans_init(void)
134 {
135 
136 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
137 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
138 	cv_init(&fstrans_state_cv, "fstchg");
139 	cv_init(&fstrans_count_cv, "fstcnt");
140 	fstrans_psz = pserialize_create();
141 	LIST_INIT(&fstrans_fli_head);
142 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
143 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
144 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
145 	KASSERT(fstrans_lwp_cache != NULL);
146 	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
147 	    true, &fstrans_mount_hashmask);
148 }
149 
150 /*
151  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
152  * produces cache misses on MP.  Minimise by keeping free entries on list.
153  */
154 int
155 fstrans_lwp_pcc(void *arg, void *obj, int flags)
156 {
157 	struct fstrans_lwp_info *fli = obj;
158 
159 	memset(fli, 0, sizeof(*fli));
160 
161 	mutex_enter(&fstrans_lock);
162 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
163 	mutex_exit(&fstrans_lock);
164 
165 	return 0;
166 }
167 
168 /*
169  * pool_cache destructor
170  */
171 void
172 fstrans_lwp_pcd(void *arg, void *obj)
173 {
174 	struct fstrans_lwp_info *fli = obj;
175 
176 	mutex_enter(&fstrans_lock);
177 	LIST_REMOVE(fli, fli_list);
178 	mutex_exit(&fstrans_lock);
179 }
180 
181 /*
182  * Deallocate lwp state.
183  */
184 void
185 fstrans_lwp_dtor(lwp_t *l)
186 {
187 	struct fstrans_lwp_info *fli, *fli_next;
188 
189 	if (l->l_fstrans == NULL)
190 		return;
191 
192 	mutex_enter(&fstrans_lock);
193 	for (fli = l->l_fstrans; fli; fli = fli_next) {
194 		KASSERT(fli->fli_trans_cnt == 0);
195 		KASSERT(fli->fli_cow_cnt == 0);
196 		KASSERT(fli->fli_self == l);
197 		if (fli->fli_mount != NULL)
198 			fstrans_mount_dtor(fli->fli_mountinfo);
199 		fli_next = fli->fli_succ;
200 		fli->fli_alias_cnt = 0;
201 		fli->fli_mount = NULL;
202 		fli->fli_alias = NULL;
203 		fli->fli_mountinfo = NULL;
204 		fli->fli_self = NULL;
205 	}
206 	mutex_exit(&fstrans_lock);
207 
208 	for (fli = l->l_fstrans; fli; fli = fli_next) {
209 		fli_next = fli->fli_succ;
210 		pool_cache_put(fstrans_lwp_cache, fli);
211 	}
212 	l->l_fstrans = NULL;
213 }
214 
215 /*
216  * mount pointer to hash
217  */
218 static inline uint32_t
219 fstrans_mount_hash(struct mount *mp)
220 {
221 
222 	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
223 	    fstrans_mount_hashmask;
224 }
225 
226 /*
227  * retrieve fstrans_mount_info by mount or NULL
228  */
229 static inline struct fstrans_mount_info *
230 fstrans_mount_get(struct mount *mp)
231 {
232 	uint32_t indx;
233 	struct fstrans_mount_info *fmi, *fmi_lower;
234 
235 	KASSERT(mutex_owned(&fstrans_lock));
236 
237 	indx = fstrans_mount_hash(mp);
238 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
239 		if (fmi->fmi_mount == mp) {
240 			if (__predict_false(mp->mnt_lower != NULL &&
241 			    fmi->fmi_lower_info == NULL)) {
242 				/*
243 				 * Intern the lower/lowest mount into
244 				 * this mount info on first lookup.
245 				 */
246 				KASSERT(fmi->fmi_ref_cnt == 1);
247 
248 				fmi_lower = fstrans_mount_get(mp->mnt_lower);
249 				if (fmi_lower && fmi_lower->fmi_lower_info)
250 					fmi_lower = fmi_lower->fmi_lower_info;
251 				if (fmi_lower == NULL)
252 					return NULL;
253 				fmi->fmi_lower_info = fmi_lower;
254 				fmi->fmi_lower_info->fmi_ref_cnt += 1;
255 			}
256 			return fmi;
257 		}
258 	}
259 
260 	return NULL;
261 }
262 
263 /*
264  * Dereference mount state.
265  */
266 static void
267 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
268 {
269 
270 	KASSERT(mutex_owned(&fstrans_lock));
271 
272 	KASSERT(fmi != NULL);
273 	fmi->fmi_ref_cnt -= 1;
274 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
275 		return;
276 	}
277 
278 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
279 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
280 	KASSERT(fmi->fmi_owner == NULL);
281 
282 	if (fmi->fmi_lower_info)
283 		fstrans_mount_dtor(fmi->fmi_lower_info);
284 
285 	KASSERT(fstrans_gone_count > 0);
286 	fstrans_gone_count -= 1;
287 
288 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
289 	kmem_free(fmi, sizeof(*fmi));
290 }
291 
292 /*
293  * Allocate mount state.
294  */
295 int
296 fstrans_mount(struct mount *mp)
297 {
298 	uint32_t indx;
299 	struct fstrans_mount_info *newfmi;
300 
301 	indx = fstrans_mount_hash(mp);
302 
303 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
304 	newfmi->fmi_state = FSTRANS_NORMAL;
305 	newfmi->fmi_ref_cnt = 1;
306 	newfmi->fmi_gone = false;
307 	LIST_INIT(&newfmi->fmi_cow_handler);
308 	newfmi->fmi_cow_change = false;
309 	newfmi->fmi_mount = mp;
310 	newfmi->fmi_lower_info = NULL;
311 	newfmi->fmi_owner = NULL;
312 
313 	mutex_enter(&fstrans_lock);
314 	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
315 	mutex_exit(&fstrans_lock);
316 
317 	return 0;
318 }
319 
320 /*
321  * Deallocate mount state.
322  */
323 void
324 fstrans_unmount(struct mount *mp)
325 {
326 	uint32_t indx;
327 	struct fstrans_mount_info *fmi;
328 
329 	indx = fstrans_mount_hash(mp);
330 
331 	mutex_enter(&fstrans_lock);
332 	fmi = fstrans_mount_get(mp);
333 	KASSERT(fmi != NULL);
334 	fmi->fmi_gone = true;
335 	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
336 	    fmi, fstrans_mount_info, fmi_hash);
337 	fstrans_gone_count += 1;
338 	fstrans_mount_dtor(fmi);
339 	mutex_exit(&fstrans_lock);
340 }
341 
342 /*
343  * Clear mount entries whose mount is gone.
344  */
345 static void
346 fstrans_clear_lwp_info(void)
347 {
348 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
349 
350 	/*
351 	 * Scan our list clearing entries whose mount is gone.
352 	 */
353 	mutex_enter(&fstrans_lock);
354 	for (p = &curlwp->l_fstrans; *p; ) {
355 		fli = *p;
356 		if (fli->fli_mount != NULL &&
357 		    fli->fli_mountinfo->fmi_gone &&
358 		    fli->fli_trans_cnt == 0 &&
359 		    fli->fli_cow_cnt == 0 &&
360 		    fli->fli_alias_cnt == 0) {
361 			*p = (*p)->fli_succ;
362 			fstrans_mount_dtor(fli->fli_mountinfo);
363 			if (fli->fli_alias) {
364 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
365 				fli->fli_alias->fli_alias_cnt--;
366 			}
367 			fli->fli_mount = NULL;
368 			fli->fli_alias = NULL;
369 			fli->fli_mountinfo = NULL;
370 			fli->fli_self = NULL;
371 			p = &curlwp->l_fstrans;
372 			fli->fli_succ = tofree;
373 			tofree = fli;
374 		} else {
375 			p = &(*p)->fli_succ;
376 		}
377 	}
378 #ifdef DIAGNOSTIC
379 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
380 		if (fli->fli_alias != NULL)
381 			KASSERT(fli->fli_alias->fli_self == curlwp);
382 #endif /* DIAGNOSTIC */
383 	mutex_exit(&fstrans_lock);
384 
385 	while (tofree != NULL) {
386 		fli = tofree;
387 		tofree = fli->fli_succ;
388 		pool_cache_put(fstrans_lwp_cache, fli);
389 	}
390 }
391 
392 /*
393  * Allocate and return per lwp info for this mount.
394  */
395 static struct fstrans_lwp_info *
396 fstrans_alloc_lwp_info(struct mount *mp)
397 {
398 	struct fstrans_lwp_info *fli, *fli_lower;
399 	struct fstrans_mount_info *fmi;
400 
401 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
402 		if (fli->fli_mount == mp)
403 			return fli;
404 	}
405 
406 	/*
407 	 * Lookup mount info and get lower mount per lwp info.
408 	 */
409 	mutex_enter(&fstrans_lock);
410 	fmi = fstrans_mount_get(mp);
411 	if (fmi == NULL) {
412 		mutex_exit(&fstrans_lock);
413 		return NULL;
414 	}
415 	fmi->fmi_ref_cnt += 1;
416 	mutex_exit(&fstrans_lock);
417 
418 	if (fmi->fmi_lower_info) {
419 		fli_lower =
420 		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
421 		if (fli_lower == NULL) {
422 			mutex_enter(&fstrans_lock);
423 			fstrans_mount_dtor(fmi);
424 			mutex_exit(&fstrans_lock);
425 
426 			return NULL;
427 		}
428 	} else {
429 		fli_lower = NULL;
430 	}
431 
432 	/*
433 	 * Allocate a new entry.
434 	 */
435 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
436 	KASSERT(fli->fli_trans_cnt == 0);
437 	KASSERT(fli->fli_cow_cnt == 0);
438 	KASSERT(fli->fli_alias_cnt == 0);
439 	KASSERT(fli->fli_mount == NULL);
440 	KASSERT(fli->fli_alias == NULL);
441 	KASSERT(fli->fli_mountinfo == NULL);
442 	KASSERT(fli->fli_self == NULL);
443 
444 	/*
445 	 * Attach the mount info and alias.
446 	 */
447 
448 	fli->fli_self = curlwp;
449 	fli->fli_mount = mp;
450 	fli->fli_mountinfo = fmi;
451 
452 	fli->fli_succ = curlwp->l_fstrans;
453 	curlwp->l_fstrans = fli;
454 
455 	if (fli_lower) {
456 		fli->fli_alias = fli_lower;
457 		fli->fli_alias->fli_alias_cnt++;
458 		fli = fli->fli_alias;
459 	}
460 
461 	return fli;
462 }
463 
464 /*
465  * Retrieve the per lwp info for this mount allocating if necessary.
466  */
467 static inline struct fstrans_lwp_info *
468 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
469 {
470 	struct fstrans_lwp_info *fli;
471 
472 	/*
473 	 * Scan our list for a match.
474 	 */
475 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
476 		if (fli->fli_mount == mp) {
477 			KASSERT((mp->mnt_lower == NULL) ==
478 			    (fli->fli_alias == NULL));
479 			if (fli->fli_alias != NULL)
480 				fli = fli->fli_alias;
481 			break;
482 		}
483 	}
484 
485 	if (do_alloc) {
486 		if (__predict_false(fli == NULL))
487 			fli = fstrans_alloc_lwp_info(mp);
488 	}
489 
490 	return fli;
491 }
492 
493 /*
494  * Check if this lock type is granted at this state.
495  */
496 static bool
497 grant_lock(const struct fstrans_mount_info *fmi,
498     const enum fstrans_lock_type type)
499 {
500 
501 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
502 		return true;
503 	if (fmi->fmi_owner == curlwp)
504 		return true;
505 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
506 		return true;
507 
508 	return false;
509 }
510 
/*
 * Start a transaction.  If this thread already has a transaction on this
 * file system increment the reference counter.
 *
 * mp:		mount to start the transaction on.
 * lock_type:	lock type requested, checked with grant_lock().
 * wait:	if zero, do not sleep when the lock is not granted.
 *
 * Returns 0 on success and also when no per lwp info can be obtained
 * for "mp".  Returns EBUSY if the lock is not granted and "wait" is zero.
 */
static inline int
_fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	ASSERT_SLEEPABLE();

	fli = fstrans_get_lwp_info(mp, true);
	if (fli == NULL)
		return 0;
	fmi = fli->fli_mountinfo;

	/* Nested transaction: just count it, the lock type stays. */
	if (fli->fli_trans_cnt > 0) {
		fli->fli_trans_cnt += 1;

		return 0;
	}

	/*
	 * Fast path: check the state and enter the transaction inside
	 * a pserialize read section, without taking fstrans_lock.
	 */
	s = pserialize_read_enter();
	if (__predict_true(grant_lock(fmi, lock_type))) {
		fli->fli_trans_cnt = 1;
		fli->fli_lock_type = lock_type;
		pserialize_read_exit(s);

		return 0;
	}
	pserialize_read_exit(s);

	if (! wait)
		return EBUSY;

	/*
	 * Slow path: sleep on fstrans_state_cv until the lock is granted.
	 */
	mutex_enter(&fstrans_lock);
	while (! grant_lock(fmi, lock_type))
		cv_wait(&fstrans_state_cv, &fstrans_lock);
	fli->fli_trans_cnt = 1;
	fli->fli_lock_type = lock_type;
	mutex_exit(&fstrans_lock);

	return 0;
}
557 
558 void
559 fstrans_start(struct mount *mp)
560 {
561 	int error __diagused;
562 
563 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
564 	KASSERT(error == 0);
565 }
566 
567 int
568 fstrans_start_nowait(struct mount *mp)
569 {
570 
571 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
572 }
573 
574 void
575 fstrans_start_lazy(struct mount *mp)
576 {
577 	int error __diagused;
578 
579 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
580 	KASSERT(error == 0);
581 }
582 
/*
 * Finish a transaction.
 *
 * Does nothing if the current lwp has no per lwp info for "mp".
 * A nested transaction only decrements the count.  Finishing the
 * outermost transaction also reaps entries of gone mounts and wakes a
 * thread waiting on fstrans_count_cv if the state is not FSTRANS_NORMAL.
 */
void
fstrans_done(struct mount *mp)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	fli = fstrans_get_lwp_info(mp, false);
	if (fli == NULL)
		return;
	fmi = fli->fli_mountinfo;
	KASSERT(fli->fli_trans_cnt > 0);

	/* Leaving a nested transaction. */
	if (fli->fli_trans_cnt > 1) {
		fli->fli_trans_cnt -= 1;

		return;
	}

	/*
	 * Some mount info is gone: clean up our list.  Our own entry
	 * still has fli_trans_cnt == 1 here and so is not removed.
	 */
	if (__predict_false(fstrans_gone_count > 0))
		fstrans_clear_lwp_info();

	/*
	 * Fast path: in state FSTRANS_NORMAL clear the count inside a
	 * pserialize read section, without taking fstrans_lock.
	 */
	s = pserialize_read_enter();
	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
		fli->fli_trans_cnt = 0;
		pserialize_read_exit(s);

		return;
	}
	pserialize_read_exit(s);

	/*
	 * Slow path: clear the count with the lock held and signal
	 * fstrans_count_cv.
	 */
	mutex_enter(&fstrans_lock);
	fli->fli_trans_cnt = 0;
	cv_signal(&fstrans_count_cv);
	mutex_exit(&fstrans_lock);
}
622 
623 /*
624  * Check if we hold an lock.
625  */
626 int
627 fstrans_held(struct mount *mp)
628 {
629 	struct fstrans_lwp_info *fli;
630 	struct fstrans_mount_info *fmi;
631 
632 	KASSERT(mp != dead_rootmount);
633 
634 	fli = fstrans_get_lwp_info(mp, false);
635 	if (fli == NULL)
636 		return 0;
637 	fmi = fli->fli_mountinfo;
638 
639 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
640 }
641 
642 /*
643  * Check if this thread has an exclusive lock.
644  */
645 int
646 fstrans_is_owner(struct mount *mp)
647 {
648 	struct fstrans_lwp_info *fli;
649 	struct fstrans_mount_info *fmi;
650 
651 	KASSERT(mp != dead_rootmount);
652 
653 	fli = fstrans_get_lwp_info(mp, false);
654 	if (fli == NULL)
655 		return 0;
656 	fmi = fli->fli_mountinfo;
657 
658 	return (fmi->fmi_owner == curlwp);
659 }
660 
661 /*
662  * True, if no thread is in a transaction not granted at the current state.
663  */
664 static bool
665 state_change_done(const struct fstrans_mount_info *fmi)
666 {
667 	struct fstrans_lwp_info *fli;
668 
669 	KASSERT(mutex_owned(&fstrans_lock));
670 
671 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
672 		if (fli->fli_mountinfo != fmi)
673 			continue;
674 		if (fli->fli_trans_cnt == 0)
675 			continue;
676 		if (fli->fli_self == curlwp)
677 			continue;
678 		if (grant_lock(fmi, fli->fli_lock_type))
679 			continue;
680 
681 		return false;
682 	}
683 
684 	return true;
685 }
686 
/*
 * Set new file system state.
 *
 * Publishes "new_state" and then sleeps until no other thread is in a
 * transaction that is not granted at that state.
 *
 * Returns 0 on success or if the state already is "new_state", ENOENT
 * if no per lwp info can be obtained for "mp", or the error returned
 * by cv_wait_sig(), in which case the state is set to FSTRANS_NORMAL.
 */
int
fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
{
	int error;
	enum fstrans_state old_state;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	KASSERT(mp != dead_rootmount);

	fli = fstrans_get_lwp_info(mp, true);
	if (fli == NULL)
		return ENOENT;
	fmi = fli->fli_mountinfo;
	old_state = fmi->fmi_state;
	if (old_state == new_state)
		return 0;

	/* Publish the new state, then run a pserialize pass. */
	mutex_enter(&fstrans_lock);
	fmi->fmi_state = new_state;
	pserialize_perform(fstrans_psz);

	/*
	 * All threads see the new state now.
	 * Wait for transactions invalid at this state to leave.
	 */
	error = 0;
	while (! state_change_done(fmi)) {
		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
		if (error) {
			/* Interrupted: fall back to the normal state. */
			new_state = fmi->fmi_state = FSTRANS_NORMAL;
			break;
		}
	}
	/*
	 * Record ownership: leaving FSTRANS_NORMAL makes us the owner,
	 * returning to FSTRANS_NORMAL clears the owner.
	 */
	if (old_state != new_state) {
		if (old_state == FSTRANS_NORMAL) {
			KASSERT(fmi->fmi_owner == NULL);
			fmi->fmi_owner = curlwp;
		}
		if (new_state == FSTRANS_NORMAL) {
			KASSERT(fmi->fmi_owner == curlwp);
			fmi->fmi_owner = NULL;
		}
	}
	/* Wake all threads sleeping on fstrans_state_cv. */
	cv_broadcast(&fstrans_state_cv);
	mutex_exit(&fstrans_lock);

	return error;
}
739 
740 /*
741  * Get current file system state.
742  */
743 enum fstrans_state
744 fstrans_getstate(struct mount *mp)
745 {
746 	struct fstrans_lwp_info *fli;
747 	struct fstrans_mount_info *fmi;
748 
749 	KASSERT(mp != dead_rootmount);
750 
751 	fli = fstrans_get_lwp_info(mp, true);
752 	KASSERT(fli != NULL);
753 	fmi = fli->fli_mountinfo;
754 
755 	return fmi->fmi_state;
756 }
757 
758 /*
759  * Request a filesystem to suspend all operations.
760  */
761 int
762 vfs_suspend(struct mount *mp, int nowait)
763 {
764 	struct fstrans_lwp_info *fli;
765 	int error;
766 
767 	if (mp == dead_rootmount)
768 		return EOPNOTSUPP;
769 
770 	fli = fstrans_get_lwp_info(mp, true);
771 	if (fli == NULL)
772 		return ENOENT;
773 
774 	if (nowait) {
775 		if (!mutex_tryenter(&vfs_suspend_lock))
776 			return EWOULDBLOCK;
777 	} else
778 		mutex_enter(&vfs_suspend_lock);
779 
780 	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
781 		mutex_exit(&vfs_suspend_lock);
782 		return error;
783 	}
784 
785 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
786 		vfs_resume(mp);
787 		return ENOENT;
788 	}
789 
790 	return 0;
791 }
792 
793 /*
794  * Request a filesystem to resume all operations.
795  */
796 void
797 vfs_resume(struct mount *mp)
798 {
799 	struct fstrans_lwp_info *fli;
800 
801 	KASSERT(mp != dead_rootmount);
802 
803 	fli = fstrans_get_lwp_info(mp, false);
804 	mp = fli->fli_mount;
805 
806 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
807 	mutex_exit(&vfs_suspend_lock);
808 }
809 
810 
811 /*
812  * True, if no thread is running a cow handler.
813  */
814 static bool
815 cow_state_change_done(const struct fstrans_mount_info *fmi)
816 {
817 	struct fstrans_lwp_info *fli;
818 
819 	KASSERT(mutex_owned(&fstrans_lock));
820 	KASSERT(fmi->fmi_cow_change);
821 
822 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
823 		if (fli->fli_mount != fmi->fmi_mount)
824 			continue;
825 		if (fli->fli_cow_cnt == 0)
826 			continue;
827 
828 		return false;
829 	}
830 
831 	return true;
832 }
833 
/*
 * Prepare for changing this mounts cow list.
 * Returns with fstrans_lock locked.
 *
 * Sets fmi_cow_change and sleeps until no thread runs a cow handler on
 * this mount.  cow_change_done() is the counterpart: it clears the flag
 * and releases the lock.
 */
static void
cow_change_enter(struct fstrans_mount_info *fmi)
{

	mutex_enter(&fstrans_lock);

	/*
	 * Wait for other threads changing the list.
	 */
	while (fmi->fmi_cow_change)
		cv_wait(&fstrans_state_cv, &fstrans_lock);

	/*
	 * Wait until all threads are aware of a state change.
	 */
	fmi->fmi_cow_change = true;
	pserialize_perform(fstrans_psz);

	/*
	 * Wait for running cow handlers to leave; fscow_run() signals
	 * fstrans_count_cv when it is done and sees the flag set.
	 */
	while (! cow_state_change_done(fmi))
		cv_wait(&fstrans_count_cv, &fstrans_lock);
}
859 
/*
 * Done changing this mounts cow list.
 *
 * Must be called with fstrans_lock held, as left by cow_change_enter().
 * Clears fmi_cow_change, wakes all threads sleeping on fstrans_state_cv
 * and releases fstrans_lock.
 */
static void
cow_change_done(struct fstrans_mount_info *fmi)
{

	KASSERT(mutex_owned(&fstrans_lock));

	fmi->fmi_cow_change = false;
	pserialize_perform(fstrans_psz);

	cv_broadcast(&fstrans_state_cv);

	mutex_exit(&fstrans_lock);
}
876 
877 /*
878  * Add a handler to this mount.
879  */
880 int
881 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
882     void *arg)
883 {
884 	struct fstrans_mount_info *fmi;
885 	struct fscow_handler *newch;
886 
887 	KASSERT(mp != dead_rootmount);
888 
889 	mutex_enter(&fstrans_lock);
890 	fmi = fstrans_mount_get(mp);
891 	KASSERT(fmi != NULL);
892 	fmi->fmi_ref_cnt += 1;
893 	mutex_exit(&fstrans_lock);
894 
895 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
896 	newch->ch_func = func;
897 	newch->ch_arg = arg;
898 
899 	cow_change_enter(fmi);
900 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
901 	cow_change_done(fmi);
902 
903 	return 0;
904 }
905 
906 /*
907  * Remove a handler from this mount.
908  */
909 int
910 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
911     void *arg)
912 {
913 	struct fstrans_mount_info *fmi;
914 	struct fscow_handler *hp = NULL;
915 
916 	KASSERT(mp != dead_rootmount);
917 
918 	mutex_enter(&fstrans_lock);
919 	fmi = fstrans_mount_get(mp);
920 	KASSERT(fmi != NULL);
921 	mutex_exit(&fstrans_lock);
922 
923 	cow_change_enter(fmi);
924 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
925 		if (hp->ch_func == func && hp->ch_arg == arg)
926 			break;
927 	if (hp != NULL) {
928 		LIST_REMOVE(hp, ch_list);
929 		kmem_free(hp, sizeof(*hp));
930 	}
931 	fstrans_mount_dtor(fmi);
932 	cow_change_done(fmi);
933 
934 	return hp ? 0 : EINVAL;
935 }
936 
/*
 * Check for need to copy block that is about to be written.
 *
 * Runs the copy-on-write handlers registered on the mount the buffer
 * belongs to, unless B_COWDONE is already set.  "data_valid" is passed
 * through to the handlers.
 *
 * Returns 0 on success, in which case B_COWDONE is set on the buffer,
 * or the error of the first failing handler.
 */
int
fscow_run(struct buf *bp, bool data_valid)
{
	int error, s;
	struct mount *mp;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;
	struct fscow_handler *hp;

	/*
	 * First check if we need run the copy-on-write handler.
	 */
	if ((bp->b_flags & B_COWDONE))
		return 0;
	if (bp->b_vp == NULL) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}
	/* For a block device use the file system mounted on it. */
	if (bp->b_vp->v_type == VBLK)
		mp = spec_node_getmountedfs(bp->b_vp);
	else
		mp = bp->b_vp->v_mount;
	if (mp == NULL || mp == dead_rootmount) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}

	fli = fstrans_get_lwp_info(mp, true);
	KASSERT(fli != NULL);
	fmi = fli->fli_mountinfo;

	/*
	 * On non-recursed run check if other threads
	 * want to change the list.
	 */
	if (fli->fli_cow_cnt == 0) {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* A change is in progress: wait for it to finish. */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			while (fmi->fmi_cow_change)
				cv_wait(&fstrans_state_cv, &fstrans_lock);
			fli->fli_cow_cnt = 1;
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 1;
			pserialize_read_exit(s);
		}
	} else
		fli->fli_cow_cnt += 1;

	/*
	 * Run all copy-on-write handlers, stop on error.
	 */
	error = 0;
	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
			break;
	if (error == 0)
		bp->b_flags |= B_COWDONE;

	/*
	 * Check if other threads want to change the list.
	 */
	if (fli->fli_cow_cnt > 1) {
		fli->fli_cow_cnt -= 1;
	} else {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/*
			 * A thread wants to change the list: clear our
			 * count under the lock and signal fstrans_count_cv.
			 */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			fli->fli_cow_cnt = 0;
			cv_signal(&fstrans_count_cv);
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 0;
			pserialize_read_exit(s);
		}
	}

	return error;
}
1022 
1023 #if defined(DDB)
1024 void fstrans_dump(int);
1025 
1026 static void
1027 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
1028 {
1029 	char prefix[9];
1030 	struct fstrans_lwp_info *fli;
1031 
1032 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1033 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1034 		if (fli->fli_self != l)
1035 			continue;
1036 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1037 			if (! verbose)
1038 				continue;
1039 		}
1040 		printf("%-8s", prefix);
1041 		if (verbose)
1042 			printf(" @%p", fli);
1043 		if (fli->fli_mount == dead_rootmount)
1044 			printf(" <dead>");
1045 		else if (fli->fli_mount != NULL)
1046 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1047 		else
1048 			printf(" NULL");
1049 		if (fli->fli_alias != NULL) {
1050 			struct mount *amp = fli->fli_alias->fli_mount;
1051 
1052 			printf(" alias");
1053 			if (verbose)
1054 				printf(" @%p", fli->fli_alias);
1055 			if (amp == NULL)
1056 				printf(" NULL");
1057 			else
1058 				printf(" (%s)", amp->mnt_stat.f_mntonname);
1059 		}
1060 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1061 			printf(" gone");
1062 		if (fli->fli_trans_cnt == 0) {
1063 			printf(" -");
1064 		} else {
1065 			switch (fli->fli_lock_type) {
1066 			case FSTRANS_LAZY:
1067 				printf(" lazy");
1068 				break;
1069 			case FSTRANS_SHARED:
1070 				printf(" shared");
1071 				break;
1072 			default:
1073 				printf(" %#x", fli->fli_lock_type);
1074 				break;
1075 			}
1076 		}
1077 		printf(" %d cow %d alias %d\n",
1078 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1079 		prefix[0] = '\0';
1080 	}
1081 }
1082 
1083 static void
1084 fstrans_print_mount(struct mount *mp, int verbose)
1085 {
1086 	uint32_t indx;
1087 	struct fstrans_mount_info *fmi;
1088 
1089 	indx = fstrans_mount_hash(mp);
1090 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
1091 		if (fmi->fmi_mount == mp)
1092 			break;
1093 
1094 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1095 		return;
1096 
1097 	printf("%-16s ", mp->mnt_stat.f_mntonname);
1098 	if (fmi == NULL) {
1099 		printf("(null)\n");
1100 		return;
1101 	}
1102 	printf("owner %p ", fmi->fmi_owner);
1103 	switch (fmi->fmi_state) {
1104 	case FSTRANS_NORMAL:
1105 		printf("state normal\n");
1106 		break;
1107 	case FSTRANS_SUSPENDING:
1108 		printf("state suspending\n");
1109 		break;
1110 	case FSTRANS_SUSPENDED:
1111 		printf("state suspended\n");
1112 		break;
1113 	default:
1114 		printf("state %#x\n", fmi->fmi_state);
1115 		break;
1116 	}
1117 }
1118 
1119 void
1120 fstrans_dump(int full)
1121 {
1122 	const struct proclist_desc *pd;
1123 	struct proc *p;
1124 	struct lwp *l;
1125 	struct mount *mp;
1126 
1127 	printf("Fstrans locks by lwp:\n");
1128 	for (pd = proclists; pd->pd_list != NULL; pd++)
1129 		PROCLIST_FOREACH(p, pd->pd_list)
1130 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1131 				fstrans_print_lwp(p, l, full == 1);
1132 
1133 	printf("Fstrans state by mount:\n");
1134 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1135 		fstrans_print_mount(mp, full == 1);
1136 }
1137 #endif /* defined(DDB) */
1138