/*	$NetBSD: ffs_alloc.c,v 1.159 2017/12/07 21:53:41 chs Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_alloc.c	8.19 (Berkeley) 7/13/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.159 2017/12/07 21:53:41 chs Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_uvm_page_trkown.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#ifdef UVM_PAGE_TRKOWN
#include <uvm/uvm.h>
#endif

static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int, int);
static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int);
static ino_t ffs_dirpref(struct inode *);
static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
static void ffs_fserr(struct fs *, kauth_cred_t, const char *);
static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, int,
    daddr_t (*)(struct inode *, int, daddr_t, int, int, int));
static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int, int);
static int32_t ffs_mapsearch(struct fs *, struct cg *,
    daddr_t, int);
static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
    daddr_t, long, bool);
static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
    int, bool);

/* if 1, changes in optimization strategy are logged */
int ffs_log_changeopt = 0;

/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];

/* Basic consistency check for block allocations */
static int
ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
    long size, dev_t dev, ino_t inum)
{
	if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
	    ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
		panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
		    " bsize = %d, size = %ld, fs = %s", func,
		    (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
	}

	if (bno >= fs->fs_size) {
		printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
		    (unsigned long long)inum);
		ffs_fserr(fs, NOCRED, "bad block");
		return EINVAL;
	}
	return 0;
}

/*
 * Allocate a block in the file system.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate a block in the same cylinder group.
 *   4) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 *   1) allocate a block in the cylinder group that contains the
 *      inode for the file.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 *
 * => called with um_lock held
 * => releases um_lock before returning
 */
int
ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
    int flags, kauth_cred_t cred, daddr_t *bnp)
{
	struct ufsmount *ump;
	struct fs *fs;
	daddr_t bno;
	int cg;
#if defined(QUOTA) || defined(QUOTA2)
	int error;
#endif

	fs = ip->i_fs;
	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

#ifdef UVM_PAGE_TRKOWN

	/*
	 * Sanity-check that allocations within the file size
	 * do not allow other threads to read the stale contents
	 * of newly allocated blocks.
	 * Usually pages will exist to cover the new allocation.
	 * There is an optimization in ffs_write() where we skip
	 * creating pages if several conditions are met:
	 *  - the file must not be mapped (in any user address space).
	 *  - the write must cover whole pages and whole blocks.
	 * If those conditions are not met then pages must exist and
	 * be locked by the current thread.
	 */

	struct vnode *vp = ITOV(ip);
	if (vp->v_type == VREG &&
	    ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) &&
	    ((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 ||
	     ffs_blkoff(fs, size) != 0)) {
		struct vm_page *pg;
		struct uvm_object *uobj = &vp->v_uobj;
		voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
		voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);

		mutex_enter(uobj->vmobjlock);
		while (off < endoff) {
			pg = uvm_pagelookup(uobj, off);
			KASSERT((pg != NULL && pg->owner_tag != NULL &&
				 pg->owner == curproc->p_pid &&
				 pg->lowner == curlwp->l_lid));
			off += PAGE_SIZE;
		}
		mutex_exit(uobj->vmobjlock);
	}
#endif

	*bnp = 0;

	KASSERTMSG((cred != NOCRED), "missing credential");
	KASSERTMSG(((u_int)size <= fs->fs_bsize),
	    "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
	KASSERTMSG((ffs_fragoff(fs, size) == 0),
	    "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);

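	/*
	 * Quick check: if a full block is wanted and the filesystem-wide
	 * summary shows no free blocks anywhere, fail early before
	 * consulting any per-cg state.
	 */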
	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	if (freespace(fs, fs->fs_minfree) <= 0 &&
	    kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
	    NULL, NULL) != 0)
		goto nospace;
#if defined(QUOTA) || defined(QUOTA2)
	mutex_exit(&ump->um_lock);
	if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
		return (error);
	mutex_enter(&ump->um_lock);
#endif

	if (bpref >= fs->fs_size)
		bpref = 0;
	if (bpref == 0)
		cg = ino_to_cg(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);
	if (bno > 0) {
		DIP_ADD(ip, blocks, btodb(size));
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bnp = bno;
		return (0);
	}
#if defined(QUOTA) || defined(QUOTA2)
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(size), cred, FORCE);
#endif
	if (flags & B_CONTIG) {
		/*
		 * XXX ump->um_lock handling is "suspect" at best.
		 * For the case where ffs_hashalloc() fails early
		 * in the B_CONTIG case we reach here with um_lock
		 * already unlocked, so we can't release it again
		 * like in the normal error path.  See kern/39206.
		 *
		 * Fail silently - it's up to our caller to report
		 * errors.
		 */
		return (ENOSPC);
	}
nospace:
	mutex_exit(&ump->um_lock);
	ffs_fserr(fs, cred, "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size
 *
 * The number and size of the old block are given, and a preference
 * and new size are also specified. The allocator attempts to extend
 * the original block. Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 *
 * => called with um_lock held
 * => returns with um_lock released
 */
int
ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
    int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop)
{
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
	int cg, request, error;
	daddr_t bprev, bno;

	fs = ip->i_fs;
	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

#ifdef UVM_PAGE_TRKOWN

	/*
	 * Sanity-check that allocations within the file size
	 * do not allow other threads to read the stale contents
	 * of newly allocated blocks.
	 * Unlike in ffs_alloc(), here pages must always exist
	 * for such allocations, because only the last block of a file
	 * can be a fragment and ffs_write() will reallocate the
	 * fragment to the new size using ufs_balloc_range(),
	 * which always creates pages to cover blocks it allocates.
	 */

	if (ITOV(ip)->v_type == VREG) {
		struct vm_page *pg;
		struct uvm_object *uobj = &ITOV(ip)->v_uobj;
		voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
		voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);

		mutex_enter(uobj->vmobjlock);
		while (off < endoff) {
			pg = uvm_pagelookup(uobj, off);
			KASSERT(pg->owner == curproc->p_pid &&
				pg->lowner == curlwp->l_lid);
			off += PAGE_SIZE;
		}
		mutex_exit(uobj->vmobjlock);
	}
#endif

	KASSERTMSG((cred != NOCRED), "missing credential");
	KASSERTMSG(((u_int)osize <= fs->fs_bsize),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);
	KASSERTMSG((ffs_fragoff(fs, osize) == 0),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);
	KASSERTMSG(((u_int)nsize <= fs->fs_bsize),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);
	KASSERTMSG((ffs_fragoff(fs, nsize) == 0),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);

	if (freespace(fs, fs->fs_minfree) <= 0 &&
	    kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
	    NULL, NULL) != 0) {
		mutex_exit(&ump->um_lock);
		goto nospace;
	}
	if (fs->fs_magic == FS_UFS2_MAGIC)
		bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs));
	else
		bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs));

	if (bprev == 0) {
		panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
		    PRId64 ", fs = %s", __func__,
		    (unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
		    fs->fs_fsmnt);
	}
	mutex_exit(&ump->um_lock);

	/*
	 * Allocate the extra space in the buffer.
	 */
	if (bpp != NULL &&
	    (error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) {
		return (error);
	}
#if defined(QUOTA) || defined(QUOTA2)
	if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
		if (bpp != NULL) {
			brelse(bp, 0);
		}
		return (error);
	}
#endif
	/*
	 * Check for extension in the existing location.
	 */
	cg = dtog(fs, bprev);
	mutex_enter(&ump->um_lock);
	if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
		DIP_ADD(ip, blocks, btodb(nsize - osize));
		ip->i_flag |= IN_CHANGE | IN_UPDATE;

		if (bpp != NULL) {
			if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
				panic("%s: bad blockno %#llx != %#llx",
				    __func__, (unsigned long long) bp->b_blkno,
				    (unsigned long long)FFS_FSBTODB(fs, bno));
			}
			allocbuf(bp, nsize, 1);
			memset((char *)bp->b_data + osize, 0, nsize - osize);
			mutex_enter(bp->b_objlock);
			KASSERT(!cv_has_waiters(&bp->b_done));
			bp->b_oflags |= BO_DONE;
			mutex_exit(bp->b_objlock);
			*bpp = bp;
		}
		if (blknop != NULL) {
			*blknop = bno;
		}
		return (0);
	}
	/*
	 * Allocate a new disk location.
	 */
	if (bpref >= fs->fs_size)
		bpref = 0;
	switch ((int)fs->fs_optim) {
	case FS_OPTSPACE:
		/*
		 * Allocate an exact sized fragment. Although this makes
		 * best use of space, we will waste time relocating it if
		 * the file continues to grow. If the fragmentation is
		 * less than half of the minimum free reserve, we choose
		 * to begin optimizing for time.
		 */
		request = nsize;
		if (fs->fs_minfree < 5 ||
		    fs->fs_cstotal.cs_nffree >
		    fs->fs_dsize * fs->fs_minfree / (2 * 100))
			break;

		if (ffs_log_changeopt) {
			log(LOG_NOTICE,
				"%s: optimization changed from SPACE to TIME\n",
				fs->fs_fsmnt);
		}

		fs->fs_optim = FS_OPTTIME;
		break;
	case FS_OPTTIME:
		/*
		 * At this point we have discovered a file that is trying to
		 * grow a small fragment to a larger fragment. To save time,
		 * we allocate a full sized block, then free the unused portion.
		 * If the file continues to grow, the `ffs_fragextend' call
		 * above will be able to grow it in place without further
		 * copying. If aberrant programs cause disk fragmentation to
		 * grow within 2% of the free reserve, we choose to begin
		 * optimizing for space.
		 */
		request = fs->fs_bsize;
		if (fs->fs_cstotal.cs_nffree <
		    fs->fs_dsize * (fs->fs_minfree - 2) / 100)
			break;

		if (ffs_log_changeopt) {
			log(LOG_NOTICE,
				"%s: optimization changed from TIME to SPACE\n",
				fs->fs_fsmnt);
		}

		fs->fs_optim = FS_OPTSPACE;
		break;
	default:
		panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
		    __func__, (unsigned long long)ip->i_dev, fs->fs_optim,
		    fs->fs_fsmnt);
		/* NOTREACHED */
	}
	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg);
	if (bno > 0) {
		/*
		 * Use forced deallocation registration - we can't handle
		 * failure here. This is safe, as this place is hit at most
		 * once per write operation, when a fragment is extended to
		 * a longer fragment or to a full block.
		 */
		if ((ip->i_ump->um_mountp->mnt_wapbl) &&
		    (ITOV(ip)->v_type != VREG)) {
			/* this should never fail */
			error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
			    ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
			    osize);
			if (error)
				panic("ffs_realloccg: dealloc registration failed");
		} else {
			ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
			    ip->i_number);
		}
		DIP_ADD(ip, blocks, btodb(nsize - osize));
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		if (bpp != NULL) {
			bp->b_blkno = FFS_FSBTODB(fs, bno);
			allocbuf(bp, nsize, 1);
			memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
			mutex_enter(bp->b_objlock);
			KASSERT(!cv_has_waiters(&bp->b_done));
			bp->b_oflags |= BO_DONE;
			mutex_exit(bp->b_objlock);
			*bpp = bp;
		}
		if (blknop != NULL) {
			*blknop = bno;
		}
		return (0);
	}
	mutex_exit(&ump->um_lock);

#if defined(QUOTA) || defined(QUOTA2)
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
#endif
	if (bpp != NULL) {
		brelse(bp, 0);
	}

nospace:
	/*
	 * no space available
	 */
	ffs_fserr(fs, cred, "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Allocate an inode in the file system.
 *
 * If allocating a directory, use ffs_dirpref to select the inode.
 * If allocating in a directory, the following hierarchy is followed:
 *   1) allocate the preferred inode.
 *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 *
 * => um_lock not held upon entry or return
 */
int
ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop)
{
	struct ufsmount *ump;
	struct inode *pip;
	struct fs *fs;
	ino_t ino, ipref;
	int cg, error;

	UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);

	pip = VTOI(pvp);
	fs = pip->i_fs;
	ump = pip->i_ump;

	error = UFS_WAPBL_BEGIN(pvp->v_mount);
	if (error) {
		return error;
	}
	mutex_enter(&ump->um_lock);
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(pip);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = ino_to_cg(fs, ipref);
	/*
	 * Track the number of dirs created one after another
	 * in the same cg without intervening files.
	 */
	if ((mode & IFMT) == IFDIR) {
		if (fs->fs_contigdirs[cg] < 255)
			fs->fs_contigdirs[cg]++;
	} else {
		if (fs->fs_contigdirs[cg] > 0)
			fs->fs_contigdirs[cg]--;
	}
	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg);
	if (ino == 0)
		goto noinodes;
	UFS_WAPBL_END(pvp->v_mount);
	*inop = ino;
	return 0;

noinodes:
	mutex_exit(&ump->um_lock);
	UFS_WAPBL_END(pvp->v_mount);
	ffs_fserr(fs, cred, "out of inodes");
	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
	return ENOSPC;
}

/*
 * Find a cylinder group in which to place a directory.
 *
 * The policy implemented by this algorithm is to allocate a
 * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for its files' inodes
 * and data. Restrict the number of directories which may be
 * allocated one after another in the same cylinder group
 * without intervening allocation of files.
 *
 * If we allocate a first level directory then force allocation
 * in another cylinder group.
 */
static ino_t
ffs_dirpref(struct inode *pip)
{
	register struct fs *fs;
	int cg, prefcg;
	int64_t dirsize, cgsize, curdsz;
	int avgifree, avgbfree, avgndir;
	int minifree, minbfree, maxndir;
	int mincg, minndir;
	int maxcontigdirs;

	KASSERT(mutex_owned(&pip->i_ump->um_lock));

	fs = pip->i_fs;

	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;

	/*
	 * Force allocation in another cg if creating a first level dir.
	 */
	if (ITOV(pip)->v_vflag & VV_ROOT) {
		prefcg = cprng_fast32() % fs->fs_ncg;
		mincg = prefcg;
		minndir = fs->fs_ipg;
		for (cg = prefcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		for (cg = 0; cg < prefcg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		return ((ino_t)(fs->fs_ipg * mincg));
	}

	/*
	 * Compute the various limits that are used for
	 * optimal allocation of a directory inode.
	 * Try cylinder groups with >75% avgifree and avgbfree.
	 * Avoid cylinder groups with no free blocks or inodes as that
	 * triggers an I/O-expensive cylinder group scan.
	 */
	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
	minifree = avgifree - avgifree / 4;
	if (minifree < 1)
		minifree = 1;
	minbfree = avgbfree - avgbfree / 4;
	if (minbfree < 1)
		minbfree = 1;
	cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
	dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
	if (avgndir != 0) {
		curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
		if (dirsize < curdsz)
			dirsize = curdsz;
	}
	if (cgsize < dirsize * 255)
		maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
	else
		maxcontigdirs = 255;
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = min(maxcontigdirs,
				    fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;
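	/*
	 * Worked example (hypothetical numbers, not taken from this file):
	 * with fs_fsize = 2048 and fs_fpg = 65536 a cg spans 128MB; with
	 * fs_avgfilesize = 16384 and fs_avgfpdir = 64 the expected
	 * directory footprint is dirsize = 1MB, so maxcontigdirs becomes
	 * roughly the number of 1MB directories that fit in the cg's free
	 * space (avgbfree * fs_bsize / dirsize); the 255 ceiling applies
	 * only when a cg could hold more than 255 such directories.
	 */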

	/*
	 * Limit number of dirs in one cg and reserve space for
	 * regular files, but only if we have no deficit in
	 * inodes or space.
	 */
	prefcg = ino_to_cg(fs, pip->i_number);
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	/*
	 * This is a backstop when we are deficient in space.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			return ((ino_t)(fs->fs_ipg * cg));
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			break;
	return ((ino_t)(fs->fs_ipg * cg));
}

/*
 * Select the desired position for the next block in a file.  The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
 * the file. If no blocks have been allocated in any other section, the
 * policy is to place the section in a cylinder group with a greater than
 * average number of free blocks.  An appropriate cylinder group is found
 * by using a rotor that sweeps the cylinder groups. When a new group of
 * blocks is needed, the sweep begins in the cylinder group following the
 * cylinder group from which the previous allocation was made. The sweep
 * continues until a cylinder group with greater than the average number
 * of free blocks is found. If the allocation is for the first block in an
 * indirect block, the information on the previous allocation is unavailable;
 * here a best guess is made based upon the logical block number being
 * allocated.
 *
 * If a section is already partially allocated, the policy is to
 * contiguously allocate fs_maxcontig blocks.  The end of one of these
 * contiguous blocks and the beginning of the next is laid out
 * contiguously if possible.
 *
 * => um_lock held on entry and exit
 */
daddr_t
ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
    int32_t *bap /* XXX ondisk32 */)
{
	struct fs *fs;
	int cg;
	int avgbfree, startcg;

	KASSERT(mutex_owned(&ip->i_ump->um_lock));

	fs = ip->i_fs;

	/*
	 * If allocating a contiguous file with B_CONTIG, use the hints
	 * in the inode extensions to return the desired block.
	 *
	 * For metadata (indirect blocks) return the address of where
	 * the first indirect block resides - we'll scan for the next
	 * available slot if we need to allocate more than one indirect
	 * block.  For data, return the address of the actual block
	 * relative to the address of the first data block.
	 */
	if (flags & B_CONTIG) {
		KASSERT(ip->i_ffs_first_data_blk != 0);
		KASSERT(ip->i_ffs_first_indir_blk != 0);
		if (flags & B_METAONLY)
			return ip->i_ffs_first_indir_blk;
		else
			return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
	}

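	/*
	 * A new section starts when indx is a multiple of fs_maxbpg or
	 * when the previous block's address is unknown (bap[indx - 1] is
	 * zero); otherwise we continue from the previous block below.
	 */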
	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
			cg = ino_to_cg(fs, ip->i_number);
			return (cgbase(fs, cg) + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs,
				ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		for (cg = 0; cg < startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		return (0);
	}
	/*
	 * We just always try to lay things out contiguously.
	 */
	return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}

daddr_t
ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
    int64_t *bap)
{
	struct fs *fs;
	int cg;
	int avgbfree, startcg;

	KASSERT(mutex_owned(&ip->i_ump->um_lock));

	fs = ip->i_fs;

	/*
	 * If allocating a contiguous file with B_CONTIG, use the hints
	 * in the inode extensions to return the desired block.
	 *
	 * For metadata (indirect blocks) return the address of where
	 * the first indirect block resides - we'll scan for the next
	 * available slot if we need to allocate more than one indirect
	 * block.  For data, return the address of the actual block
	 * relative to the address of the first data block.
	 */
	if (flags & B_CONTIG) {
		KASSERT(ip->i_ffs_first_data_blk != 0);
		KASSERT(ip->i_ffs_first_indir_blk != 0);
		if (flags & B_METAONLY)
			return ip->i_ffs_first_indir_blk;
		else
			return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
	}

	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
			cg = ino_to_cg(fs, ip->i_number);
			return (cgbase(fs, cg) + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs,
				ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		for (cg = 0; cg < startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		return (0);
	}
	/*
	 * We just always try to lay things out contiguously.
	 */
	return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}


/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
 *   3) brute force search for a free block.
 *
 * => called with um_lock held
 * => returns with um_lock released on success, held on failure
 *    (*allocator releases lock on success, retains lock on failure)
 */
/*VARARGS5*/
static daddr_t
ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
    int size /* size for data blocks, mode for inodes */,
    int realsize,
    int flags,
    daddr_t (*allocator)(struct inode *, int, daddr_t, int, int, int))
{
	struct fs *fs;
	daddr_t result;
	int i, icg = cg;

	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 */
	result = (*allocator)(ip, cg, pref, size, realsize, flags);
	if (result)
		return (result);

	if (flags & B_CONTIG)
		return (result);
	/*
	 * 2: quadratic rehash
	 */
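	/*
	 * Starting from the original cg (icg), the doubling stride below
	 * probes cgs icg+1, icg+3, icg+7, icg+15, ... (mod fs_ncg).
	 */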
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size, realsize, flags);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size, realsize, flags);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	return (0);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 *
 * => called with um_lock held
 * => returns with um_lock released on success, held on failure
 */
static daddr_t
ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize)
{
	struct ufsmount *ump;
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	daddr_t bno;
	int frags, bbase;
	int i, error;
	u_int8_t *blksfree;

	fs = ip->i_fs;
	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

	if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
		return (0);
	frags = ffs_numfrags(fs, nsize);
	bbase = ffs_fragnum(fs, bprev);
	if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (0);
	}
	mutex_exit(&ump->um_lock);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
		goto fail;
	cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
	bno = dtogd(fs, bprev);
	blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
	for (i = ffs_numfrags(fs, osize); i < frags; i++)
		if (isclr(blksfree, bno + i))
			goto fail;
	/*
	 * the current fragment can be extended
	 * deduct the count on fragment being extended into
	 * increase the count on the remaining fragment (if any)
	 * allocate the extended piece
	 */
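	/*
	 * cg_frsum[n] counts free runs of exactly n fragments.  The free
	 * run that followed the old fragment had i - numfrags(osize)
	 * fragments; after the extension it has i - frags, so the run is
	 * moved from one bucket to the other (and dropped entirely when
	 * nothing is left over).
	 */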
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
	if (i != frags)
		ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
	mutex_enter(&ump->um_lock);
	for (i = ffs_numfrags(fs, osize); i < frags; i++) {
		clrbit(blksfree, bno + i);
		ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
		fs->fs_cstotal.cs_nffree--;
		fs->fs_cs(fs, cg).cs_nffree--;
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
	bdwrite(bp);
	return (bprev);

 fail:
	if (bp != NULL)
		brelse(bp, 0);
	mutex_enter(&ump->um_lock);
	return (0);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 */
static daddr_t
ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int realsize,
    int flags)
{
	struct ufsmount *ump;
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	struct buf *bp;
	int32_t bno;
	daddr_t blkno;
	int error, frags, allocsiz, i;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (0);
	mutex_exit(&ump->um_lock);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
		goto fail;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	if (size == fs->fs_bsize) {
		mutex_enter(&ump->um_lock);
		blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
		ACTIVECG_CLR(fs, cg);
		mutex_exit(&ump->um_lock);

		/*
		 * If actually needed size is lower, free the extra blocks now.
		 * This is safe to call here, there is no outside reference
		 * to this block yet. It is not necessary to keep um_lock
		 * locked.
		 */
		if (realsize != 0 && realsize < size) {
			ffs_blkfree_common(ip->i_ump, ip->i_fs,
			    ip->i_devvp->v_rdev,
			    bp, blkno + ffs_numfrags(fs, realsize),
			    (long)(size - realsize), false);
		}

		bdwrite(bp);
		return (blkno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	blksfree = cg_blksfree(cgp, needswap);
	frags = ffs_numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0)
			goto fail;
		mutex_enter(&ump->um_lock);
		blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
		bno = dtogd(fs, blkno);
		for (i = frags; i < fs->fs_frag; i++)
			setbit(blksfree, bno + i);
		i = fs->fs_frag - frags;
		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		fs->fs_fmod = 1;
		ufs_add32(cgp->cg_frsum[i], 1, needswap);
		ACTIVECG_CLR(fs, cg);
		mutex_exit(&ump->um_lock);
		bdwrite(bp);
		return (blkno);
	}
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
#if 0
	/*
	 * XXX fvdl mapsearch will panic, and never return -1
	 *          also: returning NULL as daddr_t ?
	 */
	if (bno < 0)
		goto fail;
#endif
	for (i = 0; i < frags; i++)
		clrbit(blksfree, bno + i);
	mutex_enter(&ump->um_lock);
	ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
	if (frags != allocsiz)
		ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
	blkno = cgbase(fs, cg) + bno;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
	bdwrite(bp);
	return blkno;

 fail:
	if (bp != NULL)
		brelse(bp, 0);
	mutex_enter(&ump->um_lock);
	return (0);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 */
static daddr_t
ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize,
    int flags)
{
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	int cg;
	daddr_t blkno;
	int32_t bno;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(mutex_owned(&ip->i_ump->um_lock));

	cgp = (struct cg *)bp->b_data;
	blksfree = cg_blksfree(cgp, needswap);
	if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
		bpref = ufs_rw32(cgp->cg_rotor, needswap);
	} else {
		bpref = ffs_blknum(fs, bpref);
		bno = dtogd(fs, bpref);
		/*
		 * if the requested block is available, use it
		 */
		if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
			goto gotit;
		/*
		 * if the requested data block isn't available and we are
		 * trying to allocate a contiguous file, return an error.
		 */
		if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
			return (0);
	}

	/*
	 * Take the next available block in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
#if 0
	/*
	 * XXX jdolecek ffs_mapsearch() succeeds or panics
	 */
	if (bno < 0)
		return (0);
#endif
	cgp->cg_rotor = ufs_rw32(bno, needswap);
gotit:
	blkno = ffs_fragstoblks(fs, bno);
	ffs_clrblock(fs, blksfree, blkno);
	ffs_clusteracct(fs, cgp, blkno, -1);
	ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
	    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
		int cylno;
		cylno = old_cbtocylno(fs, bno);
		KASSERT(cylno >= 0);
		KASSERT(cylno < fs->fs_old_ncyl);
		KASSERT(old_cbtorpos(fs, bno) >= 0);
		KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
		ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
		    needswap);
		ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
	}
	fs->fs_fmod = 1;
	cg = ufs_rw32(cgp->cg_cgx, needswap);
	blkno = cgbase(fs, cg) + bno;
	return (blkno);
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 *   1) allocate the requested inode.
 *   2) allocate the next available inode after the requested
 *      inode in the specified cylinder group.
 */
static daddr_t
ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int realsize,
    int flags)
{
	struct ufsmount *ump = ip->i_ump;
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	struct buf *bp, *ibp;
	u_int8_t *inosused;
	int error, start, len, loc, map, i;
	int32_t initediblk;
	daddr_t nalloc;
	struct ufs2_dinode *dp2;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(mutex_owned(&ump->um_lock));
	UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);

	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (0);
	mutex_exit(&ump->um_lock);
	ibp = NULL;
	initediblk = -1;
retry:
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
		goto fail;

	if (ibp != NULL &&
	    initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
		/* Another thread allocated more inodes so we retry the test. */
		brelse(ibp, 0);
		ibp = NULL;
	}
	/*
	 * Check to see if we need to initialize more inodes.
	 */
	if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
		initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
		nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
		if (nalloc + FFS_INOPB(fs) > initediblk &&
		    initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
			/*
			 * We have to release the cg buffer here to prevent
			 * a deadlock when reading the inode block will
			 * run a copy-on-write that might use this cg.
			 */
			brelse(bp, 0);
			bp = NULL;
			error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
			    ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
			    FFS_NOBLK, fs->fs_bsize, false, &ibp);
			if (error)
				goto fail;
			goto retry;
		}
	}

	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	inosused = cg_inosused(cgp, needswap);
	if (ipref) {
		ipref %= fs->fs_ipg;
		if (isclr(inosused, ipref))
			goto gotit;
	}
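	/*
	 * Scan the inosused bitmap forward from the irotor, a byte at a
	 * time: skpc() skips over bytes that are all 0xff (fully
	 * allocated) and returns the number of bytes left from the first
	 * byte with a clear bit, or 0 if every byte matched.
	 */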
	start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
	len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
		NBBY);
	loc = skpc(0xff, len, &inosused[start]);
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = skpc(0xff, len, &inosused[0]);
		if (loc == 0) {
			panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
			    __func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
			    fs->fs_fsmnt);
			/* NOTREACHED */
		}
	}
	i = start + len - loc;
	map = inosused[i] ^ 0xff;
	if (map == 0) {
		panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
	}
	ipref = i * NBBY + ffs(map) - 1;
	cgp->cg_irotor = ufs_rw32(ipref, needswap);
gotit:
	UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
	    mode);
	/*
	 * Check to see if we need to initialize more inodes.
	 */
	if (ibp != NULL) {
		KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
		memset(ibp->b_data, 0, fs->fs_bsize);
		dp2 = (struct ufs2_dinode *)(ibp->b_data);
		for (i = 0; i < FFS_INOPB(fs); i++) {
			/*
			 * Don't bother to swap, it's supposed to be
			 * random, after all.
			 */
			dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
			dp2++;
		}
		initediblk += FFS_INOPB(fs);
		cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
	}

	mutex_enter(&ump->um_lock);
	ACTIVECG_CLR(fs, cg);
	setbit(inosused, ipref);
	ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
	mutex_exit(&ump->um_lock);
	if (ibp != NULL) {
		bwrite(ibp);
		bwrite(bp);
	} else
		bdwrite(bp);
	return (cg * fs->fs_ipg + ipref);
 fail:
	if (bp != NULL)
		brelse(bp, 0);
	if (ibp != NULL)
		brelse(ibp, 0);
	mutex_enter(&ump->um_lock);
	return (0);
}

/*
 * Allocate a block or fragment.
 *
 * The specified block or fragment is removed from the
 * free map, possibly fragmenting a block in the process.
 *
 * This implementation should mirror ffs_blkfree
 *
 * => um_lock not held on entry or exit
 */
int
ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
{
	int error;

	error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
	    ip->i_dev, ip->i_uid);
	if (error)
		return error;

	return ffs_blkalloc_ump(ip->i_ump, bno, size);
}

int
ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
{
	struct fs *fs = ump->um_fs;
	struct cg *cgp;
	struct buf *bp;
	int32_t fragno, cgbno;
	int i, error, cg, blk, frags, bbase;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
	    ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
	KASSERT(bno < fs->fs_size);

	cg = dtog(fs, bno);
	error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error) {
		return error;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return EIO;
	}
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	cgp->cg_time = ufs_rw64(time_second, needswap);
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp, needswap);

	mutex_enter(&ump->um_lock);
	if (size == fs->fs_bsize) {
		fragno = ffs_fragstoblks(fs, cgbno);
		if (!ffs_isblock(fs, blksfree, fragno)) {
			mutex_exit(&ump->um_lock);
			brelse(bp, 0);
			return EBUSY;
		}
		ffs_clrblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, -1);
		ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
		fs->fs_cstotal.cs_nbfree--;
		fs->fs_cs(fs, cg).cs_nbfree--;
	} else {
		bbase = cgbno - ffs_fragnum(fs, cgbno);

		frags = ffs_numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isclr(blksfree, cgbno + i)) {
				mutex_exit(&ump->um_lock);
				brelse(bp, 0);
				return EBUSY;
			}
		}
		/*
		 * if a complete block is being split, account for it
		 */
		fragno = ffs_fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
			fs->fs_cstotal.cs_nffree += fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, -1);
			ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
			fs->fs_cstotal.cs_nbfree--;
			fs->fs_cs(fs, cg).cs_nbfree--;
		}
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
		/*
		 * allocate the fragment
		 */
		for (i = 0; i < frags; i++) {
			clrbit(blksfree, cgbno + i);
		}
		ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
		fs->fs_cstotal.cs_nffree -= i;
		fs->fs_cs(fs, cg).cs_nffree -= i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
	bdwrite(bp);
	return 0;
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 *
 * => um_lock not held on entry or exit
 */
static void
ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
{
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	daddr_t cgblkno;
	int error, cg;
	dev_t dev;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(!devvp_is_snapshot);

	cg = dtog(fs, bno);
	dev = devvp->v_rdev;
	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
	KASSERT(fs == ump->um_fs);
	cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));

	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return;
	}

	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);

	bdwrite(bp);
}

struct discardopdata {
	struct work wk; /* must be first */
	struct vnode *devvp;
	daddr_t bno;
	long size;
};

struct discarddata {
	struct fs *fs;
	struct discardopdata *entry;
	long maxsize;
	kmutex_t entrylk;
	struct workqueue *wq;
	int wqcnt, wqdraining;
	kmutex_t wqlk;
	kcondvar_t wqcv;
	/* timer for flush? */
};
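
/*
 * Discards are batched: ffs_blkfree() coalesces adjacent frees into the
 * single pending `entry' (ffs deallocates backwards, so only prepends
 * are merged) and hands it to the workqueue once it reaches maxsize,
 * where ffs_discardcb() issues the trim and then frees the range.
 */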
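/*
 * Free the accumulated range cg by cg: each pass covers at most the
 * remainder of the current fs block (so ffs_blkfree_cg() never sees a
 * range crossing a block boundary), under its own WAPBL transaction.
 */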
static void
ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
{
	struct mount *mp = spec_node_getmountedfs(td->devvp);
	long todo;
	int error;

	while (td->size) {
		todo = min(td->size,
		  ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
		error = UFS_WAPBL_BEGIN(mp);
		if (error) {
			printf("ffs: failed to begin wapbl transaction"
			    " for discard: %d\n", error);
			break;
		}
		ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
		UFS_WAPBL_END(mp);
		td->bno += ffs_numfrags(fs, todo);
		td->size -= todo;
	}
}

static void
ffs_discardcb(struct work *wk, void *arg)
{
	struct discardopdata *td = (void *)wk;
	struct discarddata *ts = arg;
	struct fs *fs = ts->fs;
	off_t start, len;
#ifdef TRIMDEBUG
	int error;
#endif

/* like FSBTODB but emits bytes; XXX move to fs.h */
#ifndef FFS_FSBTOBYTES
#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
#endif

	start = FFS_FSBTOBYTES(fs, td->bno);
	len = td->size;
#ifdef TRIMDEBUG
	error =
#endif
		VOP_FDISCARD(td->devvp, start, len);
#ifdef TRIMDEBUG
	printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
#endif

	ffs_blkfree_td(fs, td);
	kmem_free(td, sizeof(*td));
	mutex_enter(&ts->wqlk);
	ts->wqcnt--;
	if (ts->wqdraining && !ts->wqcnt)
		cv_signal(&ts->wqcv);
	mutex_exit(&ts->wqlk);
}

void *
ffs_discard_init(struct vnode *devvp, struct fs *fs)
{
	struct discarddata *ts;
	int error;

	ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
	error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
				 0, 0, 0);
	if (error) {
		kmem_free(ts, sizeof (*ts));
		return NULL;
	}
	mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ts->wqcv, "trimwqcv");
	ts->maxsize = 100*1024; /* XXX */
	ts->fs = fs;
	return ts;
}

void
ffs_discard_finish(void *vts, int flags)
{
	struct discarddata *ts = vts;
	struct discardopdata *td = NULL;

	/* wait for workqueue to drain */
	mutex_enter(&ts->wqlk);
	if (ts->wqcnt) {
		ts->wqdraining = 1;
		cv_wait(&ts->wqcv, &ts->wqlk);
	}
	mutex_exit(&ts->wqlk);

	mutex_enter(&ts->entrylk);
	if (ts->entry) {
		td = ts->entry;
		ts->entry = NULL;
	}
	mutex_exit(&ts->entrylk);
	if (td) {
		/* XXX don't tell disk, it's optional */
1679 		ffs_blkfree_td(ts->fs, td);
1680 #ifdef TRIMDEBUG
1681 		printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
1682 #endif
1683 		kmem_free(td, sizeof(*td));
1684 	}
1685 
1686 	cv_destroy(&ts->wqcv);
1687 	mutex_destroy(&ts->entrylk);
1688 	mutex_destroy(&ts->wqlk);
1689 	workqueue_destroy(ts->wq);
1690 	kmem_free(ts, sizeof(*ts));
1691 }
1692 
1693 void
1694 ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
1695     ino_t inum)
1696 {
1697 	struct ufsmount *ump;
1698 	int error;
1699 	dev_t dev;
1700 	struct discarddata *ts;
1701 	struct discardopdata *td;
1702 
1703 	dev = devvp->v_rdev;
1704 	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1705 	if (ffs_snapblkfree(fs, devvp, bno, size, inum))
1706 		return;
1707 
1708 	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1709 	if (error)
1710 		return;
1711 
1712 	if (!ump->um_discarddata) {
1713 		ffs_blkfree_cg(fs, devvp, bno, size);
1714 		return;
1715 	}
1716 
1717 #ifdef TRIMDEBUG
1718 	printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
1719 #endif
1720 	ts = ump->um_discarddata;
1721 	td = NULL;
1722 
1723 	mutex_enter(&ts->entrylk);
1724 	if (ts->entry) {
1725 		td = ts->entry;
1726 		/* ffs deallocs backwards, check for prepend only */
1727 		if (td->bno == bno + ffs_numfrags(fs, size)
1728 		    && td->size + size <= ts->maxsize) {
1729 			td->bno = bno;
1730 			td->size += size;
1731 			if (td->size < ts->maxsize) {
1732 #ifdef TRIMDEBUG
1733 				printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1734 #endif
1735 				mutex_exit(&ts->entrylk);
1736 				return;
1737 			}
1738 			size = 0; /* mark done */
1739 		}
1740 		ts->entry = NULL;
1741 	}
1742 	mutex_exit(&ts->entrylk);
1743 
1744 	if (td) {
1745 #ifdef TRIMDEBUG
1746 		printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
1747 #endif
1748 		mutex_enter(&ts->wqlk);
1749 		ts->wqcnt++;
1750 		mutex_exit(&ts->wqlk);
1751 		workqueue_enqueue(ts->wq, &td->wk, NULL);
1752 	}
1753 	if (!size)
1754 		return;
1755 
1756 	td = kmem_alloc(sizeof(*td), KM_SLEEP);
1757 	td->devvp = devvp;
1758 	td->bno = bno;
1759 	td->size = size;
1760 
1761 	if (td->size < ts->maxsize) { /* XXX always the case */
1762 		mutex_enter(&ts->entrylk);
1763 		if (!ts->entry) { /* slot may have been refilled meanwhile */
1764 #ifdef TRIMDEBUG
1765 			printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1766 #endif
1767 			ts->entry = td;
1768 			td = NULL;
1769 		}
1770 		mutex_exit(&ts->entrylk);
1771 	}
1772 	if (td) {
1773 #ifdef TRIMDEBUG
1774 		printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
1775 #endif
1776 		mutex_enter(&ts->wqlk);
1777 		ts->wqcnt++;
1778 		mutex_exit(&ts->wqlk);
1779 		workqueue_enqueue(ts->wq, &td->wk, NULL);
1780 	}
1781 }
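
/*
 * Outcomes of the deferred path above, condensed: (1) the free
 * prepends onto the pending entry and stays deferred; (2) the prepend
 * fills the entry to maxsize, so the merged range is queued; (3) the
 * free is not adjacent, so any pending entry is queued and the new
 * free is parked in its place (or queued directly if the slot was
 * refilled in the meantime).  As a sketch, reusing the hypothetical
 * try_prepend() from above plus hypothetical enqueue()/make_range()
 * helpers:
 */
#if 0	/* illustrative sketch, not compiled */
	if (pending != NULL && try_prepend(pending, bno, size, maxsize)) {
		if (pending->size < maxsize)
			return;			/* case 1: keep deferring */
		enqueue(pending);		/* case 2: flush merged range */
		pending = NULL;
		return;
	}
	if (pending != NULL) {
		enqueue(pending);		/* case 3: flush the old range */
		pending = NULL;
	}
	pending = make_range(bno, size);	/* ... and park the new one */
#endif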
1782 
1783 /*
1784  * Free a block or fragment from a snapshot cg copy.
1785  *
1786  * The specified block or fragment is placed back in the
1787  * free map. If a fragment is deallocated, its block is
1788  * checked for possible reassembly into a full free block.
1789  *
1790  * => um_lock not held on entry or exit
1791  */
1792 void
1793 ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
1794     ino_t inum)
1795 {
1796 	struct cg *cgp;
1797 	struct buf *bp;
1798 	struct ufsmount *ump;
1799 	daddr_t cgblkno;
1800 	int error, cg;
1801 	dev_t dev;
1802 	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1803 	const int needswap = UFS_FSNEEDSWAP(fs);
1804 
1805 	KASSERT(devvp_is_snapshot);
1806 
1807 	cg = dtog(fs, bno);
1808 	dev = VTOI(devvp)->i_devvp->v_rdev;
1809 	ump = VFSTOUFS(devvp->v_mount);
1810 	cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
1811 
1812 	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1813 	if (error)
1814 		return;
1815 
1816 	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1817 	    B_MODIFY, &bp);
1818 	if (error) {
1819 		return;
1820 	}
1821 	cgp = (struct cg *)bp->b_data;
1822 	if (!cg_chkmagic(cgp, needswap)) {
1823 		brelse(bp, 0);
1824 		return;
1825 	}
1826 
1827 	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1828 
1829 	bdwrite(bp);
1830 }
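
/*
 * Note the addressing difference between the snapshot and device
 * paths: a snapshot vnode is read in logical file-system blocks
 * (ffs_fragstoblks(fs, cgtod(fs, cg))), whereas the raw device vnode
 * used by ffs_freefile() below is read in DEV_BSIZE sectors
 * (FFS_FSBTODB(fs, cgtod(fs, cg))).  For an assumed 16 KB block /
 * 2 KB frag / 512-byte sector geometry, a cg at frag offset 1600
 * maps to logical block 200 (1600 / 8) on the snapshot, but to
 * sector 6400 (1600 * 4) on the device.
 */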
1831 
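/*
 * Common worker for the ffs_blkfree() variants: updates the cylinder
 * group bitmap plus the per-cg and file-system-wide statistics for
 * one freed block or fragment.  A doubly freed block panics, except
 * that a whole block already free in a snapshot cg copy is silently
 * ignored.
 */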
1832 static void
1833 ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
1834     struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
1835 {
1836 	struct cg *cgp;
1837 	int32_t fragno, cgbno;
1838 	int i, cg, blk, frags, bbase;
1839 	u_int8_t *blksfree;
1840 	const int needswap = UFS_FSNEEDSWAP(fs);
1841 
1842 	cg = dtog(fs, bno);
1843 	cgp = (struct cg *)bp->b_data;
1844 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1845 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1846 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
1847 		cgp->cg_time = ufs_rw64(time_second, needswap);
1848 	cgbno = dtogd(fs, bno);
1849 	blksfree = cg_blksfree(cgp, needswap);
1850 	mutex_enter(&ump->um_lock);
1851 	if (size == fs->fs_bsize) {
1852 		fragno = ffs_fragstoblks(fs, cgbno);
1853 		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
1854 			if (devvp_is_snapshot) {
1855 				mutex_exit(&ump->um_lock);
1856 				return;
1857 			}
1858 			panic("%s: freeing free block: dev = 0x%llx, block = %"
1859 			    PRId64 ", fs = %s", __func__,
1860 			    (unsigned long long)dev, bno, fs->fs_fsmnt);
1861 		}
1862 		ffs_setblock(fs, blksfree, fragno);
1863 		ffs_clusteracct(fs, cgp, fragno, 1);
1864 		ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1865 		fs->fs_cstotal.cs_nbfree++;
1866 		fs->fs_cs(fs, cg).cs_nbfree++;
1867 		if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1868 		    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1869 			i = old_cbtocylno(fs, cgbno);
1870 			KASSERT(i >= 0);
1871 			KASSERT(i < fs->fs_old_ncyl);
1872 			KASSERT(old_cbtorpos(fs, cgbno) >= 0);
1873 			KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
1874 			ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
1875 			    needswap);
1876 			ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
1877 		}
1878 	} else {
1879 		bbase = cgbno - ffs_fragnum(fs, cgbno);
1880 		/*
1881 		 * decrement the counts associated with the old frags
1882 		 */
1883 		blk = blkmap(fs, blksfree, bbase);
1884 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
1885 		/*
1886 		 * deallocate the fragment
1887 		 */
1888 		frags = ffs_numfrags(fs, size);
1889 		for (i = 0; i < frags; i++) {
1890 			if (isset(blksfree, cgbno + i)) {
1891 				panic("%s: freeing free frag: "
1892 				    "dev = 0x%llx, block = %" PRId64
1893 				    ", fs = %s", __func__,
1894 				    (unsigned long long)dev, bno + i,
1895 				    fs->fs_fsmnt);
1896 			}
1897 			setbit(blksfree, cgbno + i);
1898 		}
1899 		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1900 		fs->fs_cstotal.cs_nffree += i;
1901 		fs->fs_cs(fs, cg).cs_nffree += i;
1902 		/*
1903 		 * add back in counts associated with the new frags
1904 		 */
1905 		blk = blkmap(fs, blksfree, bbase);
1906 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
1907 		/*
1908 		 * if a complete block has been reassembled, account for it
1909 		 */
1910 		fragno = ffs_fragstoblks(fs, bbase);
1911 		if (ffs_isblock(fs, blksfree, fragno)) {
1912 			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
1913 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
1914 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
1915 			ffs_clusteracct(fs, cgp, fragno, 1);
1916 			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1917 			fs->fs_cstotal.cs_nbfree++;
1918 			fs->fs_cs(fs, cg).cs_nbfree++;
1919 			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1920 			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1921 				i = old_cbtocylno(fs, bbase);
1922 				KASSERT(i >= 0);
1923 				KASSERT(i < fs->fs_old_ncyl);
1924 				KASSERT(old_cbtorpos(fs, bbase) >= 0);
1925 				KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
1926 				ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
1927 				    bbase)], 1, needswap);
1928 				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
1929 			}
1930 		}
1931 	}
1932 	fs->fs_fmod = 1;
1933 	ACTIVECG_CLR(fs, cg);
1934 	mutex_exit(&ump->um_lock);
1935 }
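
/*
 * Fragment bookkeeping above follows the classic FFS pattern: decode
 * the block's free map into a descriptor (blkmap), subtract its runs
 * from cg_frsum (ffs_fragacct with -1), flip the bits, then add the
 * new runs back (+1).  A worked example for an assumed 8-frag block,
 * writing the free map with frag 0 leftmost and 1 = free:
 *
 *	before:    00111000  ->  frsum[3]++         (one free run of 3)
 *	free 5-6:  00111110  ->  frsum[3]--, frsum[5]++
 *
 * If all eight bits end up set, the run is promoted instead: nffree
 * drops by fs_frag and nbfree gains a whole free block.
 */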
1936 
1937 /*
1938  * Free an inode.
1939  */
1940 int
1941 ffs_vfree(struct vnode *vp, ino_t ino, int mode)
1942 {
1943 
1944 	return ffs_freefile(vp->v_mount, ino, mode);
1945 }
1946 
1947 /*
1948  * Do the actual free operation.
1949  * The specified inode is placed back in the free map.
1950  *
1951  * => um_lock not held on entry or exit
1952  */
1953 int
1954 ffs_freefile(struct mount *mp, ino_t ino, int mode)
1955 {
1956 	struct ufsmount *ump = VFSTOUFS(mp);
1957 	struct fs *fs = ump->um_fs;
1958 	struct vnode *devvp;
1959 	struct cg *cgp;
1960 	struct buf *bp;
1961 	int error, cg;
1962 	daddr_t cgbno;
1963 	dev_t dev;
1964 	const int needswap = UFS_FSNEEDSWAP(fs);
1965 
1966 	cg = ino_to_cg(fs, ino);
1967 	devvp = ump->um_devvp;
1968 	dev = devvp->v_rdev;
1969 	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
1970 
1971 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
1972 		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
1973 		    (unsigned long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
1974 	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
1975 	    B_MODIFY, &bp);
1976 	if (error) {
1977 		return (error);
1978 	}
1979 	cgp = (struct cg *)bp->b_data;
1980 	if (!cg_chkmagic(cgp, needswap)) {
1981 		brelse(bp, 0);
1982 		return (0);
1983 	}
1984 
1985 	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
1986 
1987 	bdwrite(bp);
1988 
1989 	return 0;
1990 }
1991 
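/*
 * Free an inode in a snapshot's cylinder group copy.  Like
 * ffs_freefile(), but the cg block is read through the snapshot
 * vnode in logical blocks rather than through the device in sectors.
 */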
1992 int
1993 ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
1994 {
1995 	struct ufsmount *ump;
1996 	struct cg *cgp;
1997 	struct buf *bp;
1998 	int error, cg;
1999 	daddr_t cgbno;
2000 	dev_t dev;
2001 	const int needswap = UFS_FSNEEDSWAP(fs);
2002 
2003 	KASSERT(devvp->v_type != VBLK);
2004 
2005 	cg = ino_to_cg(fs, ino);
2006 	dev = VTOI(devvp)->i_devvp->v_rdev;
2007 	ump = VFSTOUFS(devvp->v_mount);
2008 	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2009 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2010 		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
2011 		    (unsigned long long)dev, (unsigned long long)ino,
2012 		    fs->fs_fsmnt);
2013 	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
2014 	    B_MODIFY, &bp);
2015 	if (error) {
2016 		return (error);
2017 	}
2018 	cgp = (struct cg *)bp->b_data;
2019 	if (!cg_chkmagic(cgp, needswap)) {
2020 		brelse(bp, 0);
2021 		return (0);
2022 	}
2023 	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
2024 
2025 	bdwrite(bp);
2026 
2027 	return 0;
2028 }
2029 
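/*
 * Common worker for ffs_freefile() and ffs_freefile_snap(): clears
 * the inode's bit in the cg inosused map, rolls the allocation rotor
 * back if needed, and updates the inode and directory statistics.
 */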
2030 static void
2031 ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
2032     struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
2033 {
2034 	int cg;
2035 	struct cg *cgp;
2036 	u_int8_t *inosused;
2037 	const int needswap = UFS_FSNEEDSWAP(fs);
2038 
2039 	cg = ino_to_cg(fs, ino);
2040 	cgp = (struct cg *)bp->b_data;
2041 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
2042 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
2043 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
2044 		cgp->cg_time = ufs_rw64(time_second, needswap);
2045 	inosused = cg_inosused(cgp, needswap);
2046 	ino %= fs->fs_ipg;
2047 	if (isclr(inosused, ino)) {
2048 		printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
2049 		    (unsigned long long)dev, (unsigned long long)ino +
2050 		    cg * fs->fs_ipg, fs->fs_fsmnt);
2051 		if (fs->fs_ronly == 0)
2052 			panic("%s: freeing free inode", __func__);
2053 	}
2054 	clrbit(inosused, ino);
2055 	if (!devvp_is_snapshot)
2056 		UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
2057 		    ino + cg * fs->fs_ipg, mode);
2058 	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
2059 		cgp->cg_irotor = ufs_rw32(ino, needswap);
2060 	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
2061 	mutex_enter(&ump->um_lock);
2062 	fs->fs_cstotal.cs_nifree++;
2063 	fs->fs_cs(fs, cg).cs_nifree++;
2064 	if ((mode & IFMT) == IFDIR) {
2065 		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
2066 		fs->fs_cstotal.cs_ndir--;
2067 		fs->fs_cs(fs, cg).cs_ndir--;
2068 	}
2069 	fs->fs_fmod = 1;
2070 	ACTIVECG_CLR(fs, cg);
2071 	mutex_exit(&ump->um_lock);
2072 }
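
/*
 * Inode numbers are global, but each cg's inosused map is indexed by
 * the cg-local number, hence the "ino %= fs->fs_ipg" above and the
 * "ino + cg * fs->fs_ipg" reconstruction in the diagnostics.  For an
 * assumed fs_ipg of 1024: global inode 2571 lives in cg 2
 * (2571 / 1024) at local slot 523 (2571 % 1024), and
 * 523 + 2 * 1024 = 2571 recovers the global number.
 */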
2073 
2074 /*
2075  * Check to see if a file is free.
2076  */
2077 int
2078 ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
2079 {
2080 	struct cg *cgp;
2081 	struct buf *bp;
2082 	daddr_t cgbno;
2083 	int ret, cg;
2084 	u_int8_t *inosused;
2085 	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
2086 
2087 	KASSERT(devvp_is_snapshot);
2088 
2089 	cg = ino_to_cg(fs, ino);
2090 	if (devvp_is_snapshot)
2091 		cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2092 	else
2093 		cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
2094 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2095 		return 1;
2096 	if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
2097 		return 1;
2098 	}
2099 	cgp = (struct cg *)bp->b_data;
2100 	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
2101 		brelse(bp, 0);
2102 		return 1;
2103 	}
2104 	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
2105 	ino %= fs->fs_ipg;
2106 	ret = isclr(inosused, ino);
2107 	brelse(bp, 0);
2108 	return ret;
2109 }
2110 
2111 /*
2112  * Find a block of the specified size in the specified cylinder group.
2113  *
2114  * It is a panic if a request is made to find a block when none are
2115  * available.
2116  */
2117 static int32_t
2118 ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
2119 {
2120 	int32_t bno;
2121 	int start, len, loc, i;
2122 	int blk, field, subfield, pos;
2123 	int ostart, olen;
2124 	u_int8_t *blksfree;
2125 	const int needswap = UFS_FSNEEDSWAP(fs);
2126 
2127 	/* KASSERT(mutex_owned(&ump->um_lock)); */
2128 
2129 	/*
2130 	 * find the fragment by searching through the free block
2131 	 * map for an appropriate bit pattern
2132 	 */
2133 	if (bpref)
2134 		start = dtogd(fs, bpref) / NBBY;
2135 	else
2136 		start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
2137 	blksfree = cg_blksfree(cgp, needswap);
2138 	len = howmany(fs->fs_fpg, NBBY) - start;
2139 	ostart = start;
2140 	olen = len;
2141 	loc = scanc((u_int)len,
2142 		(const u_char *)&blksfree[start],
2143 		(const u_char *)fragtbl[fs->fs_frag],
2144 		(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
2145 	if (loc == 0) {
2146 		len = start + 1;
2147 		start = 0;
2148 		loc = scanc((u_int)len,
2149 			(const u_char *)&blksfree[0],
2150 			(const u_char *)fragtbl[fs->fs_frag],
2151 			(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
2152 		if (loc == 0) {
2153 			panic("%s: map corrupted: start=%d, len=%d, "
2154 			    "fs = %s, offset=%d/%ld, cg %d", __func__,
2155 			    ostart, olen, fs->fs_fsmnt,
2156 			    ufs_rw32(cgp->cg_freeoff, needswap),
2157 			    (long)blksfree - (long)cgp, cgp->cg_cgx);
2158 			/* NOTREACHED */
2159 		}
2160 	}
2161 	bno = (start + len - loc) * NBBY;
2162 	cgp->cg_frotor = ufs_rw32(bno, needswap);
2163 	/*
2164 	 * found the byte in the map
2165 	 * sift through the bits to find the selected frag
2166 	 */
2167 	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
2168 		blk = blkmap(fs, blksfree, bno);
2169 		blk <<= 1;
2170 		field = around[allocsiz];
2171 		subfield = inside[allocsiz];
2172 		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
2173 			if ((blk & field) == subfield)
2174 				return (bno + pos);
2175 			field <<= 1;
2176 			subfield <<= 1;
2177 		}
2178 	}
2179 	panic("%s: block not in map: bno=%d, fs=%s", __func__,
2180 	    bno, fs->fs_fsmnt);
2181 	/* return (-1); */
2182 }
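
/*
 * The sifting loop above looks for a run of exactly allocsiz free
 * frags: blk is shifted left one bit so a zero guard bit flanks the
 * low end, around[allocsiz] masks allocsiz + 2 consecutive bits, and
 * inside[allocsiz] is the pattern "0, allocsiz ones, 0", so a match
 * means a free run bounded on both sides by allocated frags or the
 * block edge.  A standalone model with its own convention (bit i set
 * means frag i is free); the mask values are derived from that shape,
 * not copied from ffs_tables.c:
 */
#if 0	/* illustrative sketch, not compiled */
#include <stdio.h>

int
main(void)
{
	int allocsiz = 2;
	int blk = 0x0c << 1;	/* frags 2-3 free, shifted for guard bit */
	int field = (1 << (allocsiz + 2)) - 1;		/* like around[2] */
	int subfield = ((1 << allocsiz) - 1) << 1;	/* like inside[2] */
	int pos;

	for (pos = 0; pos <= 8 - allocsiz; pos++) {
		if ((blk & field) == subfield) {
			printf("exact run of %d at frag %d\n", allocsiz, pos);
			return 0;
		}
		field <<= 1;
		subfield <<= 1;
	}
	printf("no exact-size run\n");
	return 0;
}
#endif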
2183 
2184 /*
2185  * Fserr prints the name of a file system with an error diagnostic.
2186  *
2187  * The form of the error message is:
2188  *	fs: error message
2189  */
2190 static void
2191 ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
2192 {
2193 	KASSERT(cred != NULL);
2194 
2195 	if (cred == NOCRED || cred == FSCRED) {
2196 		log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
2197 		    curproc->p_pid, curproc->p_comm,
2198 		    fs->fs_fsmnt, cp);
2199 	} else {
2200 		log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
2201 		    kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
2202 		    fs->fs_fsmnt, cp);
2203 	}
2204 }
2205