1 /* $NetBSD: ffs_alloc.c,v 1.170 2021/09/03 21:55:01 andvar Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2002 Networks Associates Technology, Inc. 34 * All rights reserved. 35 * 36 * This software was developed for the FreeBSD Project by Marshall 37 * Kirk McKusick and Network Associates Laboratories, the Security 38 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 39 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 40 * research program 41 * 42 * Copyright (c) 1982, 1986, 1989, 1993 43 * The Regents of the University of California. All rights reserved. 44 * 45 * Redistribution and use in source and binary forms, with or without 46 * modification, are permitted provided that the following conditions 47 * are met: 48 * 1. Redistributions of source code must retain the above copyright 49 * notice, this list of conditions and the following disclaimer. 50 * 2. Redistributions in binary form must reproduce the above copyright 51 * notice, this list of conditions and the following disclaimer in the 52 * documentation and/or other materials provided with the distribution. 53 * 3. Neither the name of the University nor the names of its contributors 54 * may be used to endorse or promote products derived from this software 55 * without specific prior written permission. 56 * 57 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 59 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 60 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 61 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 63 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 64 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 65 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 66 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 67 * SUCH DAMAGE. 68 * 69 * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 70 */ 71 72 #include <sys/cdefs.h> 73 __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.170 2021/09/03 21:55:01 andvar Exp $"); 74 75 #if defined(_KERNEL_OPT) 76 #include "opt_ffs.h" 77 #include "opt_quota.h" 78 #include "opt_uvm_page_trkown.h" 79 #endif 80 81 #include <sys/param.h> 82 #include <sys/systm.h> 83 #include <sys/buf.h> 84 #include <sys/cprng.h> 85 #include <sys/kauth.h> 86 #include <sys/kernel.h> 87 #include <sys/mount.h> 88 #include <sys/proc.h> 89 #include <sys/syslog.h> 90 #include <sys/vnode.h> 91 #include <sys/wapbl.h> 92 #include <sys/cprng.h> 93 94 #include <miscfs/specfs/specdev.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/ufsmount.h> 97 #include <ufs/ufs/inode.h> 98 #include <ufs/ufs/ufs_extern.h> 99 #include <ufs/ufs/ufs_bswap.h> 100 #include <ufs/ufs/ufs_wapbl.h> 101 102 #include <ufs/ffs/fs.h> 103 #include <ufs/ffs/ffs_extern.h> 104 105 #ifdef UVM_PAGE_TRKOWN 106 #include <uvm/uvm_object.h> 107 #include <uvm/uvm_page.h> 108 #endif 109 110 static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int, int); 111 static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int); 112 static ino_t ffs_dirpref(struct inode *); 113 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); 114 static void ffs_fserr(struct fs *, kauth_cred_t, const char *); 115 static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, int, 116 daddr_t (*)(struct inode *, int, daddr_t, int, int, int)); 117 static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int, int); 118 static int32_t ffs_mapsearch(struct fs *, struct cg *, 119 daddr_t, int); 120 static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *, 121 daddr_t, long, bool); 122 static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t, 123 int, bool); 124 125 /* if 1, changes in optimalization strategy are logged */ 126 int ffs_log_changeopt = 0; 127 128 /* in ffs_tables.c */ 129 extern const int inside[], around[]; 130 extern const u_char * const fragtbl[]; 131 132 /* Basic consistency check for block allocations */ 133 static int 134 ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno, 135 long size, dev_t dev, ino_t inum) 136 { 137 if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 || 138 ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) { 139 panic("%s: bad size: dev = 0x%llx, bno = %" PRId64 140 " bsize = %d, size = %ld, fs = %s", func, 141 (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); 142 } 143 144 if (bno >= fs->fs_size) { 145 printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno, 146 (unsigned long long)inum); 147 ffs_fserr(fs, NOCRED, "bad block"); 148 return EINVAL; 149 } 150 return 0; 151 } 152 153 /* 154 * Allocate a block in the file system. 155 * 156 * The size of the requested block is given, which must be some 157 * multiple of fs_fsize and <= fs_bsize. 
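 * For example, with fs_fsize = 2048 and fs_bsize = 16384 (fs_frag = 8),
 * the legal request sizes are 2048, 4096, ... up to 16384 bytes.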
158 * A preference may be optionally specified. If a preference is given 159 * the following hierarchy is used to allocate a block: 160 * 1) allocate the requested block. 161 * 2) allocate a rotationally optimal block in the same cylinder. 162 * 3) allocate a block in the same cylinder group. 163 * 4) quadradically rehash into other cylinder groups, until an 164 * available block is located. 165 * If no block preference is given the following hierarchy is used 166 * to allocate a block: 167 * 1) allocate a block in the cylinder group that contains the 168 * inode for the file. 169 * 2) quadradically rehash into other cylinder groups, until an 170 * available block is located. 171 * 172 * => called with um_lock held 173 * => releases um_lock before returning 174 */ 175 int 176 ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, 177 int flags, kauth_cred_t cred, daddr_t *bnp) 178 { 179 struct ufsmount *ump; 180 struct fs *fs; 181 daddr_t bno; 182 int cg; 183 #if defined(QUOTA) || defined(QUOTA2) 184 int error; 185 #endif 186 187 fs = ip->i_fs; 188 ump = ip->i_ump; 189 190 KASSERT(mutex_owned(&ump->um_lock)); 191 192 #ifdef UVM_PAGE_TRKOWN 193 194 /* 195 * Sanity-check that allocations within the file size 196 * do not allow other threads to read the stale contents 197 * of newly allocated blocks. 198 * Usually pages will exist to cover the new allocation. 199 * There is an optimization in ffs_write() where we skip 200 * creating pages if several conditions are met: 201 * - the file must not be mapped (in any user address space). 202 * - the write must cover whole pages and whole blocks. 203 * If those conditions are not met then pages must exist and 204 * be locked by the current thread. 205 */ 206 207 struct vnode *vp = ITOV(ip); 208 if (vp->v_type == VREG && (flags & IO_EXT) == 0 && 209 ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) && 210 ((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 || 211 ffs_blkoff(fs, size) != 0)) { 212 struct vm_page *pg __diagused; 213 struct uvm_object *uobj = &vp->v_uobj; 214 voff_t off = trunc_page(ffs_lblktosize(fs, lbn)); 215 voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size); 216 217 rw_enter(uobj->vmobjlock, RW_WRITER); 218 while (off < endoff) { 219 pg = uvm_pagelookup(uobj, off); 220 KASSERT((pg != NULL && pg->owner_tag != NULL && 221 pg->owner == curproc->p_pid && 222 pg->lowner == curlwp->l_lid)); 223 off += PAGE_SIZE; 224 } 225 rw_exit(uobj->vmobjlock); 226 } 227 #endif 228 229 *bnp = 0; 230 231 KASSERTMSG((cred != NOCRED), "missing credential"); 232 KASSERTMSG(((u_int)size <= fs->fs_bsize), 233 "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s", 234 (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 235 KASSERTMSG((ffs_fragoff(fs, size) == 0), 236 "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s", 237 (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 238 239 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 240 goto nospace; 241 if (freespace(fs, fs->fs_minfree) <= 0 && 242 kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, 243 NULL, NULL) != 0) 244 goto nospace; 245 #if defined(QUOTA) || defined(QUOTA2) 246 mutex_exit(&ump->um_lock); 247 if ((error = chkdq(ip, btodb(size), cred, 0)) != 0) 248 return (error); 249 mutex_enter(&ump->um_lock); 250 #endif 251 252 if (bpref >= fs->fs_size) 253 bpref = 0; 254 if (bpref == 0) 255 cg = ino_to_cg(fs, ip->i_number); 256 else 257 cg = dtog(fs, bpref); 258 bno = ffs_hashalloc(ip, cg, bpref, size, 0, 
flags, ffs_alloccg); 259 if (bno > 0) { 260 DIP_ADD(ip, blocks, btodb(size)); 261 if (flags & IO_EXT) 262 ip->i_flag |= IN_CHANGE; 263 else 264 ip->i_flag |= IN_CHANGE | IN_UPDATE; 265 *bnp = bno; 266 return (0); 267 } 268 #if defined(QUOTA) || defined(QUOTA2) 269 /* 270 * Restore user's disk quota because allocation failed. 271 */ 272 (void) chkdq(ip, -btodb(size), cred, FORCE); 273 #endif 274 if (flags & B_CONTIG) { 275 /* 276 * XXX ump->um_lock handling is "suspect" at best. 277 * For the case where ffs_hashalloc() fails early 278 * in the B_CONTIG case we reach here with um_lock 279 * already unlocked, so we can't release it again 280 * like in the normal error path. See kern/39206. 281 * 282 * 283 * Fail silently - it's up to our caller to report 284 * errors. 285 */ 286 return (ENOSPC); 287 } 288 nospace: 289 mutex_exit(&ump->um_lock); 290 ffs_fserr(fs, cred, "file system full"); 291 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); 292 return (ENOSPC); 293 } 294 295 /* 296 * Reallocate a fragment to a bigger size 297 * 298 * The number and size of the old block is given, and a preference 299 * and new size is also specified. The allocator attempts to extend 300 * the original block. Failing that, the regular block allocator is 301 * invoked to get an appropriate block. 302 * 303 * => called with um_lock held 304 * => return with um_lock released 305 */ 306 int 307 ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref, 308 int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp, 309 daddr_t *blknop) 310 { 311 struct ufsmount *ump; 312 struct fs *fs; 313 struct buf *bp; 314 int cg, request, error; 315 daddr_t bno; 316 317 fs = ip->i_fs; 318 ump = ip->i_ump; 319 320 KASSERT(mutex_owned(&ump->um_lock)); 321 322 #ifdef UVM_PAGE_TRKOWN 323 324 /* 325 * Sanity-check that allocations within the file size 326 * do not allow other threads to read the stale contents 327 * of newly allocated blocks. 328 * Unlike in ffs_alloc(), here pages must always exist 329 * for such allocations, because only the last block of a file 330 * can be a fragment and ffs_write() will reallocate the 331 * fragment to the new size using ufs_balloc_range(), 332 * which always creates pages to cover blocks it allocates. 
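 * The loop below therefore asserts that every page in the range
 * [trunc_page(ffs_lblktosize(lbprev)), round_page(ffs_lblktosize(lbprev) + osize))
 * exists and is owned by the current process and lwp.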
333 */ 334 335 if (ITOV(ip)->v_type == VREG) { 336 struct vm_page *pg __diagused; 337 struct uvm_object *uobj = &ITOV(ip)->v_uobj; 338 voff_t off = trunc_page(ffs_lblktosize(fs, lbprev)); 339 voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize); 340 341 rw_enter(uobj->vmobjlock, RW_WRITER); 342 while (off < endoff) { 343 pg = uvm_pagelookup(uobj, off); 344 KASSERT(pg->owner == curproc->p_pid && 345 pg->lowner == curlwp->l_lid); 346 off += PAGE_SIZE; 347 } 348 rw_exit(uobj->vmobjlock); 349 } 350 #endif 351 352 KASSERTMSG((cred != NOCRED), "missing credential"); 353 KASSERTMSG(((u_int)osize <= fs->fs_bsize), 354 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 355 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 356 fs->fs_fsmnt); 357 KASSERTMSG((ffs_fragoff(fs, osize) == 0), 358 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 359 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 360 fs->fs_fsmnt); 361 KASSERTMSG(((u_int)nsize <= fs->fs_bsize), 362 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 363 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 364 fs->fs_fsmnt); 365 KASSERTMSG((ffs_fragoff(fs, nsize) == 0), 366 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 367 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 368 fs->fs_fsmnt); 369 370 if (freespace(fs, fs->fs_minfree) <= 0 && 371 kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, 372 NULL, NULL) != 0) { 373 mutex_exit(&ump->um_lock); 374 goto nospace; 375 } 376 377 if (bprev == 0) { 378 panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %" 379 PRId64 ", fs = %s", __func__, 380 (unsigned long long)ip->i_dev, fs->fs_bsize, bprev, 381 fs->fs_fsmnt); 382 } 383 mutex_exit(&ump->um_lock); 384 385 /* 386 * Allocate the extra space in the buffer. 387 */ 388 if (bpp != NULL && 389 (error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) { 390 return (error); 391 } 392 #if defined(QUOTA) || defined(QUOTA2) 393 if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) { 394 if (bpp != NULL) { 395 brelse(bp, 0); 396 } 397 return (error); 398 } 399 #endif 400 /* 401 * Check for extension in the existing location. 402 */ 403 cg = dtog(fs, bprev); 404 mutex_enter(&ump->um_lock); 405 if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) { 406 DIP_ADD(ip, blocks, btodb(nsize - osize)); 407 if (flags & IO_EXT) 408 ip->i_flag |= IN_CHANGE; 409 else 410 ip->i_flag |= IN_CHANGE | IN_UPDATE; 411 412 if (bpp != NULL) { 413 if (bp->b_blkno != FFS_FSBTODB(fs, bno)) { 414 panic("%s: bad blockno %#llx != %#llx", 415 __func__, (unsigned long long) bp->b_blkno, 416 (unsigned long long)FFS_FSBTODB(fs, bno)); 417 } 418 allocbuf(bp, nsize, 1); 419 memset((char *)bp->b_data + osize, 0, nsize - osize); 420 mutex_enter(bp->b_objlock); 421 KASSERT(!cv_has_waiters(&bp->b_done)); 422 bp->b_oflags |= BO_DONE; 423 mutex_exit(bp->b_objlock); 424 *bpp = bp; 425 } 426 if (blknop != NULL) { 427 *blknop = bno; 428 } 429 return (0); 430 } 431 /* 432 * Allocate a new disk location. 433 */ 434 if (bpref >= fs->fs_size) 435 bpref = 0; 436 switch ((int)fs->fs_optim) { 437 case FS_OPTSPACE: 438 /* 439 * Allocate an exact sized fragment. Although this makes 440 * best use of space, we will waste time relocating it if 441 * the file continues to grow. If the fragmentation is 442 * less than half of the minimum free reserve, we choose 443 * to begin optimizing for time. 
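 * For example, with fs_minfree at 5%, the switch to FS_OPTTIME is made
 * once the free fragments amount to no more than 2.5% of fs_dsize
 * (the cs_nffree test below); with fs_minfree below 5% we never switch.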
444 */ 445 request = nsize; 446 if (fs->fs_minfree < 5 || 447 fs->fs_cstotal.cs_nffree > 448 fs->fs_dsize * fs->fs_minfree / (2 * 100)) 449 break; 450 451 if (ffs_log_changeopt) { 452 log(LOG_NOTICE, 453 "%s: optimization changed from SPACE to TIME\n", 454 fs->fs_fsmnt); 455 } 456 457 fs->fs_optim = FS_OPTTIME; 458 break; 459 case FS_OPTTIME: 460 /* 461 * At this point we have discovered a file that is trying to 462 * grow a small fragment to a larger fragment. To save time, 463 * we allocate a full sized block, then free the unused portion. 464 * If the file continues to grow, the `ffs_fragextend' call 465 * above will be able to grow it in place without further 466 * copying. If aberrant programs cause disk fragmentation to 467 * grow within 2% of the free reserve, we choose to begin 468 * optimizing for space. 469 */ 470 request = fs->fs_bsize; 471 if (fs->fs_cstotal.cs_nffree < 472 fs->fs_dsize * (fs->fs_minfree - 2) / 100) 473 break; 474 475 if (ffs_log_changeopt) { 476 log(LOG_NOTICE, 477 "%s: optimization changed from TIME to SPACE\n", 478 fs->fs_fsmnt); 479 } 480 481 fs->fs_optim = FS_OPTSPACE; 482 break; 483 default: 484 panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s", 485 __func__, (unsigned long long)ip->i_dev, fs->fs_optim, 486 fs->fs_fsmnt); 487 /* NOTREACHED */ 488 } 489 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg); 490 if (bno > 0) { 491 /* 492 * Use forced deallocation registration, we can't handle 493 * failure here. This is safe, as this place is ever hit 494 * maximum once per write operation, when fragment is extended 495 * to longer fragment, or a full block. 496 */ 497 if ((ip->i_ump->um_mountp->mnt_wapbl) && 498 (ITOV(ip)->v_type != VREG)) { 499 /* this should never fail */ 500 error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE( 501 ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev), 502 osize); 503 if (error) 504 panic("ffs_realloccg: dealloc registration failed"); 505 } else { 506 ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, 507 ip->i_number); 508 } 509 DIP_ADD(ip, blocks, btodb(nsize - osize)); 510 if (flags & IO_EXT) 511 ip->i_flag |= IN_CHANGE; 512 else 513 ip->i_flag |= IN_CHANGE | IN_UPDATE; 514 if (bpp != NULL) { 515 bp->b_blkno = FFS_FSBTODB(fs, bno); 516 allocbuf(bp, nsize, 1); 517 memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); 518 mutex_enter(bp->b_objlock); 519 KASSERT(!cv_has_waiters(&bp->b_done)); 520 bp->b_oflags |= BO_DONE; 521 mutex_exit(bp->b_objlock); 522 *bpp = bp; 523 } 524 if (blknop != NULL) { 525 *blknop = bno; 526 } 527 return (0); 528 } 529 mutex_exit(&ump->um_lock); 530 531 #if defined(QUOTA) || defined(QUOTA2) 532 /* 533 * Restore user's disk quota because allocation failed. 534 */ 535 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 536 #endif 537 if (bpp != NULL) { 538 brelse(bp, 0); 539 } 540 541 nospace: 542 /* 543 * no space available 544 */ 545 ffs_fserr(fs, cred, "file system full"); 546 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); 547 return (ENOSPC); 548 } 549 550 /* 551 * Allocate an inode in the file system. 552 * 553 * If allocating a directory, use ffs_dirpref to select the inode. 554 * If allocating in a directory, the following hierarchy is followed: 555 * 1) allocate the preferred inode. 556 * 2) allocate an inode in the same cylinder group. 557 * 3) quadradically rehash into other cylinder groups, until an 558 * available inode is located. 
559 * If no inode preference is given the following hierarchy is used 560 * to allocate an inode: 561 * 1) allocate an inode in cylinder group 0. 562 * 2) quadradically rehash into other cylinder groups, until an 563 * available inode is located. 564 * 565 * => um_lock not held upon entry or return 566 */ 567 int 568 ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop) 569 { 570 struct ufsmount *ump; 571 struct inode *pip; 572 struct fs *fs; 573 ino_t ino, ipref; 574 int cg, error; 575 576 UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); 577 578 pip = VTOI(pvp); 579 fs = pip->i_fs; 580 ump = pip->i_ump; 581 582 error = UFS_WAPBL_BEGIN(pvp->v_mount); 583 if (error) { 584 return error; 585 } 586 mutex_enter(&ump->um_lock); 587 if (fs->fs_cstotal.cs_nifree == 0) 588 goto noinodes; 589 590 if ((mode & IFMT) == IFDIR) 591 ipref = ffs_dirpref(pip); 592 else 593 ipref = pip->i_number; 594 if (ipref >= fs->fs_ncg * fs->fs_ipg) 595 ipref = 0; 596 cg = ino_to_cg(fs, ipref); 597 /* 598 * Track number of dirs created one after another 599 * in a same cg without intervening by files. 600 */ 601 if ((mode & IFMT) == IFDIR) { 602 if (fs->fs_contigdirs[cg] < 255) 603 fs->fs_contigdirs[cg]++; 604 } else { 605 if (fs->fs_contigdirs[cg] > 0) 606 fs->fs_contigdirs[cg]--; 607 } 608 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg); 609 if (ino == 0) 610 goto noinodes; 611 UFS_WAPBL_END(pvp->v_mount); 612 *inop = ino; 613 return 0; 614 615 noinodes: 616 mutex_exit(&ump->um_lock); 617 UFS_WAPBL_END(pvp->v_mount); 618 ffs_fserr(fs, cred, "out of inodes"); 619 uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); 620 return ENOSPC; 621 } 622 623 /* 624 * Find a cylinder group in which to place a directory. 625 * 626 * The policy implemented by this algorithm is to allocate a 627 * directory inode in the same cylinder group as its parent 628 * directory, but also to reserve space for its files inodes 629 * and data. Restrict the number of directories which may be 630 * allocated one after another in the same cylinder group 631 * without intervening allocation of files. 632 * 633 * If we allocate a first level directory then force allocation 634 * in another cylinder group. 635 */ 636 static ino_t 637 ffs_dirpref(struct inode *pip) 638 { 639 register struct fs *fs; 640 int cg, prefcg; 641 int64_t dirsize, cgsize, curdsz; 642 int avgifree, avgbfree, avgndir; 643 int minifree, minbfree, maxndir; 644 int mincg, minndir; 645 int maxcontigdirs; 646 647 KASSERT(mutex_owned(&pip->i_ump->um_lock)); 648 649 fs = pip->i_fs; 650 651 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 652 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 653 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 654 655 /* 656 * Force allocation in another cg if creating a first level dir. 
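 * (The parent is the root directory in that case; a cylinder group is
 * picked at random and then the group with the fewest directories among
 * those with at least average free inodes and blocks is chosen.)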
657 */ 658 if (ITOV(pip)->v_vflag & VV_ROOT) { 659 prefcg = cprng_fast32() % fs->fs_ncg; 660 mincg = prefcg; 661 minndir = fs->fs_ipg; 662 for (cg = prefcg; cg < fs->fs_ncg; cg++) 663 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 664 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 665 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 666 mincg = cg; 667 minndir = fs->fs_cs(fs, cg).cs_ndir; 668 } 669 for (cg = 0; cg < prefcg; cg++) 670 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 671 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 672 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 673 mincg = cg; 674 minndir = fs->fs_cs(fs, cg).cs_ndir; 675 } 676 return ((ino_t)(fs->fs_ipg * mincg)); 677 } 678 679 /* 680 * Count various limits which used for 681 * optimal allocation of a directory inode. 682 * Try cylinder groups with >75% avgifree and avgbfree. 683 * Avoid cylinder groups with no free blocks or inodes as that 684 * triggers an I/O-expensive cylinder group scan. 685 */ 686 maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg); 687 minifree = avgifree - avgifree / 4; 688 if (minifree < 1) 689 minifree = 1; 690 minbfree = avgbfree - avgbfree / 4; 691 if (minbfree < 1) 692 minbfree = 1; 693 cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg; 694 dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir; 695 if (avgndir != 0) { 696 curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir; 697 if (dirsize < curdsz) 698 dirsize = curdsz; 699 } 700 if (cgsize < dirsize * 255) 701 maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize; 702 else 703 maxcontigdirs = 255; 704 if (fs->fs_avgfpdir > 0) 705 maxcontigdirs = uimin(maxcontigdirs, 706 fs->fs_ipg / fs->fs_avgfpdir); 707 if (maxcontigdirs == 0) 708 maxcontigdirs = 1; 709 710 /* 711 * Limit number of dirs in one cg and reserve space for 712 * regular files, but only if we have no deficit in 713 * inodes or space. 714 */ 715 prefcg = ino_to_cg(fs, pip->i_number); 716 for (cg = prefcg; cg < fs->fs_ncg; cg++) 717 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 718 fs->fs_cs(fs, cg).cs_nifree >= minifree && 719 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 720 if (fs->fs_contigdirs[cg] < maxcontigdirs) 721 return ((ino_t)(fs->fs_ipg * cg)); 722 } 723 for (cg = 0; cg < prefcg; cg++) 724 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 725 fs->fs_cs(fs, cg).cs_nifree >= minifree && 726 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 727 if (fs->fs_contigdirs[cg] < maxcontigdirs) 728 return ((ino_t)(fs->fs_ipg * cg)); 729 } 730 /* 731 * This is a backstop when we are deficient in space. 732 */ 733 for (cg = prefcg; cg < fs->fs_ncg; cg++) 734 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 735 return ((ino_t)(fs->fs_ipg * cg)); 736 for (cg = 0; cg < prefcg; cg++) 737 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 738 break; 739 return ((ino_t)(fs->fs_ipg * cg)); 740 } 741 742 /* 743 * Select the desired position for the next block in a file. The file is 744 * logically divided into sections. The first section is composed of the 745 * direct blocks. Each additional section contains fs_maxbpg blocks. 746 * 747 * If no blocks have been allocated in the first section, the policy is to 748 * request a block in the same cylinder group as the inode that describes 749 * the file. If no blocks have been allocated in any other section, the 750 * policy is to place the section in a cylinder group with a greater than 751 * average number of free blocks. An appropriate cylinder group is found 752 * by using a rotor that sweeps the cylinder groups. 
When a new group of 753 * blocks is needed, the sweep begins in the cylinder group following the 754 * cylinder group from which the previous allocation was made. The sweep 755 * continues until a cylinder group with greater than the average number 756 * of free blocks is found. If the allocation is for the first block in an 757 * indirect block, the information on the previous allocation is unavailable; 758 * here a best guess is made based upon the logical block number being 759 * allocated. 760 * 761 * If a section is already partially allocated, the policy is to 762 * contiguously allocate fs_maxcontig blocks. The end of one of these 763 * contiguous blocks and the beginning of the next is laid out 764 * contigously if possible. 765 * 766 * => um_lock held on entry and exit 767 */ 768 daddr_t 769 ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, 770 int32_t *bap /* XXX ondisk32 */) 771 { 772 struct fs *fs; 773 int cg; 774 int avgbfree, startcg; 775 776 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 777 778 fs = ip->i_fs; 779 780 /* 781 * If allocating a contiguous file with B_CONTIG, use the hints 782 * in the inode extensions to return the desired block. 783 * 784 * For metadata (indirect blocks) return the address of where 785 * the first indirect block resides - we'll scan for the next 786 * available slot if we need to allocate more than one indirect 787 * block. For data, return the address of the actual block 788 * relative to the address of the first data block. 789 */ 790 if (flags & B_CONTIG) { 791 KASSERT(ip->i_ffs_first_data_blk != 0); 792 KASSERT(ip->i_ffs_first_indir_blk != 0); 793 if (flags & B_METAONLY) 794 return ip->i_ffs_first_indir_blk; 795 else 796 return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); 797 } 798 799 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 800 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { 801 cg = ino_to_cg(fs, ip->i_number); 802 return (cgbase(fs, cg) + fs->fs_frag); 803 } 804 /* 805 * Find a cylinder with greater than average number of 806 * unused data blocks. 807 */ 808 if (indx == 0 || bap[indx - 1] == 0) 809 startcg = 810 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 811 else 812 startcg = dtog(fs, 813 ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); 814 startcg %= fs->fs_ncg; 815 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 816 for (cg = startcg; cg < fs->fs_ncg; cg++) 817 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 818 return (cgbase(fs, cg) + fs->fs_frag); 819 } 820 for (cg = 0; cg < startcg; cg++) 821 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 822 return (cgbase(fs, cg) + fs->fs_frag); 823 } 824 return (0); 825 } 826 /* 827 * We just always try to lay things out contiguously. 828 */ 829 return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; 830 } 831 832 daddr_t 833 ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, 834 int64_t *bap) 835 { 836 struct fs *fs; 837 int cg; 838 int avgbfree, startcg; 839 840 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 841 842 fs = ip->i_fs; 843 844 /* 845 * If allocating a contiguous file with B_CONTIG, use the hints 846 * in the inode extensions to return the desired block. 847 * 848 * For metadata (indirect blocks) return the address of where 849 * the first indirect block resides - we'll scan for the next 850 * available slot if we need to allocate more than one indirect 851 * block. For data, return the address of the actual block 852 * relative to the address of the first data block. 
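 * A (hypothetical) caller in the block allocator would pass the result
 * as the preference to ffs_alloc(), e.g.:
 *
 *	pref = ffs_blkpref_ufs2(ip, lbn, indx, flags, &bap[0]);
 *	error = ffs_alloc(ip, lbn, pref, fs->fs_bsize, flags, cred, &newb);
 *
 * where pref, newb and error are illustrative locals.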
853 */ 854 if (flags & B_CONTIG) { 855 KASSERT(ip->i_ffs_first_data_blk != 0); 856 KASSERT(ip->i_ffs_first_indir_blk != 0); 857 if (flags & B_METAONLY) 858 return ip->i_ffs_first_indir_blk; 859 else 860 return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); 861 } 862 863 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 864 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { 865 cg = ino_to_cg(fs, ip->i_number); 866 return (cgbase(fs, cg) + fs->fs_frag); 867 } 868 /* 869 * Find a cylinder with greater than average number of 870 * unused data blocks. 871 */ 872 if (indx == 0 || bap[indx - 1] == 0) 873 startcg = 874 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 875 else 876 startcg = dtog(fs, 877 ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); 878 startcg %= fs->fs_ncg; 879 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 880 for (cg = startcg; cg < fs->fs_ncg; cg++) 881 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 882 return (cgbase(fs, cg) + fs->fs_frag); 883 } 884 for (cg = 0; cg < startcg; cg++) 885 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 886 return (cgbase(fs, cg) + fs->fs_frag); 887 } 888 return (0); 889 } 890 /* 891 * We just always try to lay things out contiguously. 892 */ 893 return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; 894 } 895 896 897 /* 898 * Implement the cylinder overflow algorithm. 899 * 900 * The policy implemented by this algorithm is: 901 * 1) allocate the block in its requested cylinder group. 902 * 2) quadradically rehash on the cylinder group number. 903 * 3) brute force search for a free block. 904 * 905 * => called with um_lock held 906 * => returns with um_lock released on success, held on failure 907 * (*allocator releases lock on success, retains lock on failure) 908 */ 909 /*VARARGS5*/ 910 static daddr_t 911 ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, 912 int size /* size for data blocks, mode for inodes */, 913 int realsize, 914 int flags, 915 daddr_t (*allocator)(struct inode *, int, daddr_t, int, int, int)) 916 { 917 struct fs *fs; 918 daddr_t result; 919 int i, icg = cg; 920 921 fs = ip->i_fs; 922 /* 923 * 1: preferred cylinder group 924 */ 925 result = (*allocator)(ip, cg, pref, size, realsize, flags); 926 if (result) 927 return (result); 928 929 if (flags & B_CONTIG) 930 return (result); 931 /* 932 * 2: quadratic rehash 933 */ 934 for (i = 1; i < fs->fs_ncg; i *= 2) { 935 cg += i; 936 if (cg >= fs->fs_ncg) 937 cg -= fs->fs_ncg; 938 result = (*allocator)(ip, cg, 0, size, realsize, flags); 939 if (result) 940 return (result); 941 } 942 /* 943 * 3: brute force search 944 * Note that we start at i == 2, since 0 was checked initially, 945 * and 1 is always checked in the quadratic rehash. 946 */ 947 cg = (icg + 2) % fs->fs_ncg; 948 for (i = 2; i < fs->fs_ncg; i++) { 949 result = (*allocator)(ip, cg, 0, size, realsize, flags); 950 if (result) 951 return (result); 952 cg++; 953 if (cg == fs->fs_ncg) 954 cg = 0; 955 } 956 return (0); 957 } 958 959 /* 960 * Determine whether a fragment can be extended. 961 * 962 * Check to see if the necessary fragments are available, and 963 * if they are, allocate them. 
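 * For example, growing a 2048-byte fragment in place to 6144 bytes
 * (with fs_fsize = 2048) needs the next two fragments of the same
 * block to be free; an extension never crosses a block boundary.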
964 * 965 * => called with um_lock held 966 * => returns with um_lock released on success, held on failure 967 */ 968 static daddr_t 969 ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize) 970 { 971 struct ufsmount *ump; 972 struct fs *fs; 973 struct cg *cgp; 974 struct buf *bp; 975 daddr_t bno; 976 int frags, bbase; 977 int i, error; 978 u_int8_t *blksfree; 979 980 fs = ip->i_fs; 981 ump = ip->i_ump; 982 983 KASSERT(mutex_owned(&ump->um_lock)); 984 985 if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize)) 986 return (0); 987 frags = ffs_numfrags(fs, nsize); 988 bbase = ffs_fragnum(fs, bprev); 989 if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) { 990 /* cannot extend across a block boundary */ 991 return (0); 992 } 993 mutex_exit(&ump->um_lock); 994 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 995 (int)fs->fs_cgsize, B_MODIFY, &bp); 996 if (error) 997 goto fail; 998 cgp = (struct cg *)bp->b_data; 999 if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) 1000 goto fail; 1001 cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs)); 1002 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1003 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1004 cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs)); 1005 bno = dtogd(fs, bprev); 1006 blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)); 1007 for (i = ffs_numfrags(fs, osize); i < frags; i++) 1008 if (isclr(blksfree, bno + i)) 1009 goto fail; 1010 /* 1011 * the current fragment can be extended 1012 * deduct the count on fragment being extended into 1013 * increase the count on the remaining fragment (if any) 1014 * allocate the extended piece 1015 */ 1016 for (i = frags; i < fs->fs_frag - bbase; i++) 1017 if (isclr(blksfree, bno + i)) 1018 break; 1019 ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs)); 1020 if (i != frags) 1021 ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs)); 1022 mutex_enter(&ump->um_lock); 1023 for (i = ffs_numfrags(fs, osize); i < frags; i++) { 1024 clrbit(blksfree, bno + i); 1025 ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs)); 1026 fs->fs_cstotal.cs_nffree--; 1027 fs->fs_cs(fs, cg).cs_nffree--; 1028 } 1029 fs->fs_fmod = 1; 1030 ACTIVECG_CLR(fs, cg); 1031 mutex_exit(&ump->um_lock); 1032 bdwrite(bp); 1033 return (bprev); 1034 1035 fail: 1036 if (bp != NULL) 1037 brelse(bp, 0); 1038 mutex_enter(&ump->um_lock); 1039 return (0); 1040 } 1041 1042 /* 1043 * Determine whether a block can be allocated. 1044 * 1045 * Check to see if a block of the appropriate size is available, 1046 * and if it is, allocate it. 
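 * This routine is the per-cylinder-group allocator handed to
 * ffs_hashalloc(), e.g. from ffs_alloc():
 *
 *	bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);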
1047 */ 1048 static daddr_t 1049 ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int realsize, 1050 int flags) 1051 { 1052 struct ufsmount *ump; 1053 struct fs *fs = ip->i_fs; 1054 struct cg *cgp; 1055 struct buf *bp; 1056 int32_t bno; 1057 daddr_t blkno; 1058 int error, frags, allocsiz, i; 1059 u_int8_t *blksfree; 1060 const int needswap = UFS_FSNEEDSWAP(fs); 1061 1062 ump = ip->i_ump; 1063 1064 KASSERT(mutex_owned(&ump->um_lock)); 1065 1066 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1067 return (0); 1068 mutex_exit(&ump->um_lock); 1069 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1070 (int)fs->fs_cgsize, B_MODIFY, &bp); 1071 if (error) 1072 goto fail; 1073 cgp = (struct cg *)bp->b_data; 1074 if (!cg_chkmagic(cgp, needswap) || 1075 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1076 goto fail; 1077 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1078 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1079 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1080 cgp->cg_time = ufs_rw64(time_second, needswap); 1081 if (size == fs->fs_bsize) { 1082 mutex_enter(&ump->um_lock); 1083 blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags); 1084 ACTIVECG_CLR(fs, cg); 1085 mutex_exit(&ump->um_lock); 1086 1087 /* 1088 * If actually needed size is lower, free the extra blocks now. 1089 * This is safe to call here, there is no outside reference 1090 * to this block yet. It is not necessary to keep um_lock 1091 * locked. 1092 */ 1093 if (realsize != 0 && realsize < size) { 1094 ffs_blkfree_common(ip->i_ump, ip->i_fs, 1095 ip->i_devvp->v_rdev, 1096 bp, blkno + ffs_numfrags(fs, realsize), 1097 (long)(size - realsize), false); 1098 } 1099 1100 bdwrite(bp); 1101 return (blkno); 1102 } 1103 /* 1104 * check to see if any fragments are already available 1105 * allocsiz is the size which will be allocated, hacking 1106 * it down to a smaller size if necessary 1107 */ 1108 blksfree = cg_blksfree(cgp, needswap); 1109 frags = ffs_numfrags(fs, size); 1110 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1111 if (cgp->cg_frsum[allocsiz] != 0) 1112 break; 1113 if (allocsiz == fs->fs_frag) { 1114 /* 1115 * no fragments were available, so a block will be 1116 * allocated, and hacked up 1117 */ 1118 if (cgp->cg_cs.cs_nbfree == 0) 1119 goto fail; 1120 mutex_enter(&ump->um_lock); 1121 blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags); 1122 bno = dtogd(fs, blkno); 1123 for (i = frags; i < fs->fs_frag; i++) 1124 setbit(blksfree, bno + i); 1125 i = fs->fs_frag - frags; 1126 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); 1127 fs->fs_cstotal.cs_nffree += i; 1128 fs->fs_cs(fs, cg).cs_nffree += i; 1129 fs->fs_fmod = 1; 1130 ufs_add32(cgp->cg_frsum[i], 1, needswap); 1131 ACTIVECG_CLR(fs, cg); 1132 mutex_exit(&ump->um_lock); 1133 bdwrite(bp); 1134 return (blkno); 1135 } 1136 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1137 #if 0 1138 /* 1139 * XXX fvdl mapsearch will panic, and never return -1 1140 * also: returning NULL as daddr_t ? 
1141 */ 1142 if (bno < 0) 1143 goto fail; 1144 #endif 1145 for (i = 0; i < frags; i++) 1146 clrbit(blksfree, bno + i); 1147 mutex_enter(&ump->um_lock); 1148 ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); 1149 fs->fs_cstotal.cs_nffree -= frags; 1150 fs->fs_cs(fs, cg).cs_nffree -= frags; 1151 fs->fs_fmod = 1; 1152 ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); 1153 if (frags != allocsiz) 1154 ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); 1155 blkno = cgbase(fs, cg) + bno; 1156 ACTIVECG_CLR(fs, cg); 1157 mutex_exit(&ump->um_lock); 1158 bdwrite(bp); 1159 return blkno; 1160 1161 fail: 1162 if (bp != NULL) 1163 brelse(bp, 0); 1164 mutex_enter(&ump->um_lock); 1165 return (0); 1166 } 1167 1168 /* 1169 * Allocate a block in a cylinder group. 1170 * 1171 * This algorithm implements the following policy: 1172 * 1) allocate the requested block. 1173 * 2) allocate a rotationally optimal block in the same cylinder. 1174 * 3) allocate the next available block on the block rotor for the 1175 * specified cylinder group. 1176 * Note that this routine only allocates fs_bsize blocks; these 1177 * blocks may be fragmented by the routine that allocates them. 1178 */ 1179 static daddr_t 1180 ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize, 1181 int flags) 1182 { 1183 struct fs *fs = ip->i_fs; 1184 struct cg *cgp; 1185 int cg; 1186 daddr_t blkno; 1187 int32_t bno; 1188 u_int8_t *blksfree; 1189 const int needswap = UFS_FSNEEDSWAP(fs); 1190 1191 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 1192 1193 cgp = (struct cg *)bp->b_data; 1194 blksfree = cg_blksfree(cgp, needswap); 1195 if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { 1196 bpref = ufs_rw32(cgp->cg_rotor, needswap); 1197 } else { 1198 bpref = ffs_blknum(fs, bpref); 1199 bno = dtogd(fs, bpref); 1200 /* 1201 * if the requested block is available, use it 1202 */ 1203 if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno))) 1204 goto gotit; 1205 /* 1206 * if the requested data block isn't available and we are 1207 * trying to allocate a contiguous file, return an error. 1208 */ 1209 if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) 1210 return (0); 1211 } 1212 1213 /* 1214 * Take the next available block in this cylinder group. 1215 */ 1216 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1217 #if 0 1218 /* 1219 * XXX jdolecek ffs_mapsearch() succeeds or panics 1220 */ 1221 if (bno < 0) 1222 return (0); 1223 #endif 1224 cgp->cg_rotor = ufs_rw32(bno, needswap); 1225 gotit: 1226 blkno = ffs_fragstoblks(fs, bno); 1227 ffs_clrblock(fs, blksfree, blkno); 1228 ffs_clusteracct(fs, cgp, blkno, -1); 1229 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1230 fs->fs_cstotal.cs_nbfree--; 1231 fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; 1232 if ((fs->fs_magic == FS_UFS1_MAGIC) && 1233 ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { 1234 int cylno; 1235 cylno = old_cbtocylno(fs, bno); 1236 KASSERT(cylno >= 0); 1237 KASSERT(cylno < fs->fs_old_ncyl); 1238 KASSERT(old_cbtorpos(fs, bno) >= 0); 1239 KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos); 1240 ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1, 1241 needswap); 1242 ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap); 1243 } 1244 fs->fs_fmod = 1; 1245 cg = ufs_rw32(cgp->cg_cgx, needswap); 1246 blkno = cgbase(fs, cg) + bno; 1247 return (blkno); 1248 } 1249 1250 /* 1251 * Determine whether an inode can be allocated. 
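 * (The result is the allocated inode number, returned as a daddr_t so
 * the routine can serve as an ffs_hashalloc() callback; 0 means no
 * inode was allocated.)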
1252 * 1253 * Check to see if an inode is available, and if it is, 1254 * allocate it using the following policy: 1255 * 1) allocate the requested inode. 1256 * 2) allocate the next available inode after the requested 1257 * inode in the specified cylinder group. 1258 */ 1259 static daddr_t 1260 ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int realsize, 1261 int flags) 1262 { 1263 struct ufsmount *ump = ip->i_ump; 1264 struct fs *fs = ip->i_fs; 1265 struct cg *cgp; 1266 struct buf *bp, *ibp; 1267 u_int8_t *inosused; 1268 int error, start, len, loc, map, i; 1269 int32_t initediblk, maxiblk, irotor; 1270 daddr_t nalloc; 1271 struct ufs2_dinode *dp2; 1272 const int needswap = UFS_FSNEEDSWAP(fs); 1273 1274 KASSERT(mutex_owned(&ump->um_lock)); 1275 UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); 1276 1277 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1278 return (0); 1279 mutex_exit(&ump->um_lock); 1280 ibp = NULL; 1281 if (fs->fs_magic == FS_UFS2_MAGIC) { 1282 initediblk = -1; 1283 } else { 1284 initediblk = fs->fs_ipg; 1285 } 1286 maxiblk = initediblk; 1287 1288 retry: 1289 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1290 (int)fs->fs_cgsize, B_MODIFY, &bp); 1291 if (error) 1292 goto fail; 1293 cgp = (struct cg *)bp->b_data; 1294 if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) 1295 goto fail; 1296 1297 if (ibp != NULL && 1298 initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { 1299 /* Another thread allocated more inodes so we retry the test. */ 1300 brelse(ibp, 0); 1301 ibp = NULL; 1302 } 1303 /* 1304 * Check to see if we need to initialize more inodes. 1305 */ 1306 if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) { 1307 initediblk = ufs_rw32(cgp->cg_initediblk, needswap); 1308 maxiblk = initediblk; 1309 nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap); 1310 if (nalloc + FFS_INOPB(fs) > initediblk && 1311 initediblk < ufs_rw32(cgp->cg_niblk, needswap)) { 1312 /* 1313 * We have to release the cg buffer here to prevent 1314 * a deadlock when reading the inode block will 1315 * run a copy-on-write that might use this cg. 
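 * (The copy-on-write hook may itself need to read and update this
 * same cylinder group and would block on the cg buffer we hold.)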
1316 */ 1317 brelse(bp, 0); 1318 bp = NULL; 1319 error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs, 1320 ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), 1321 FFS_NOBLK, fs->fs_bsize, false, &ibp); 1322 if (error) 1323 goto fail; 1324 1325 maxiblk += FFS_INOPB(fs); 1326 1327 goto retry; 1328 } 1329 } 1330 1331 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1332 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1333 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1334 cgp->cg_time = ufs_rw64(time_second, needswap); 1335 inosused = cg_inosused(cgp, needswap); 1336 1337 if (ipref) { 1338 ipref %= fs->fs_ipg; 1339 /* safeguard to stay in (to be) allocated range */ 1340 if (ipref < maxiblk && isclr(inosused, ipref)) 1341 goto gotit; 1342 } 1343 1344 irotor = ufs_rw32(cgp->cg_irotor, needswap); 1345 1346 KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d" 1347 " out of bounds, initediblk=%d", 1348 __func__, cg, irotor, initediblk); 1349 1350 start = irotor / NBBY; 1351 len = howmany(maxiblk - irotor, NBBY); 1352 loc = skpc(0xff, len, &inosused[start]); 1353 if (loc == 0) { 1354 len = start + 1; 1355 start = 0; 1356 loc = skpc(0xff, len, &inosused[0]); 1357 if (loc == 0) { 1358 panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s", 1359 __func__, cg, ufs_rw32(cgp->cg_irotor, needswap), 1360 fs->fs_fsmnt); 1361 /* NOTREACHED */ 1362 } 1363 } 1364 i = start + len - loc; 1365 map = inosused[i] ^ 0xff; 1366 if (map == 0) { 1367 panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt); 1368 } 1369 1370 ipref = i * NBBY + ffs(map) - 1; 1371 1372 cgp->cg_irotor = ufs_rw32(ipref, needswap); 1373 1374 gotit: 1375 KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to " 1376 "allocate inode index %d beyond max allocated index %d" 1377 " of %d inodes/cg", 1378 __func__, cg, (int)ipref, maxiblk, cgp->cg_niblk); 1379 1380 UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, 1381 mode); 1382 /* 1383 * Check to see if we need to initialize more inodes. 1384 */ 1385 if (ibp != NULL) { 1386 KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap)); 1387 memset(ibp->b_data, 0, fs->fs_bsize); 1388 dp2 = (struct ufs2_dinode *)(ibp->b_data); 1389 for (i = 0; i < FFS_INOPB(fs); i++) { 1390 /* 1391 * Don't bother to swap, it's supposed to be 1392 * random, after all. 1393 */ 1394 dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1; 1395 dp2++; 1396 } 1397 initediblk += FFS_INOPB(fs); 1398 cgp->cg_initediblk = ufs_rw32(initediblk, needswap); 1399 } 1400 1401 mutex_enter(&ump->um_lock); 1402 ACTIVECG_CLR(fs, cg); 1403 setbit(inosused, ipref); 1404 ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap); 1405 fs->fs_cstotal.cs_nifree--; 1406 fs->fs_cs(fs, cg).cs_nifree--; 1407 fs->fs_fmod = 1; 1408 if ((mode & IFMT) == IFDIR) { 1409 ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap); 1410 fs->fs_cstotal.cs_ndir++; 1411 fs->fs_cs(fs, cg).cs_ndir++; 1412 } 1413 mutex_exit(&ump->um_lock); 1414 if (ibp != NULL) { 1415 bwrite(ibp); 1416 bwrite(bp); 1417 } else 1418 bdwrite(bp); 1419 return (cg * fs->fs_ipg + ipref); 1420 fail: 1421 if (bp != NULL) 1422 brelse(bp, 0); 1423 if (ibp != NULL) 1424 brelse(ibp, 0); 1425 mutex_enter(&ump->um_lock); 1426 return (0); 1427 } 1428 1429 /* 1430 * Allocate a block or fragment. 1431 * 1432 * The specified block or fragment is removed from the 1433 * free map, possibly fragmenting a block in the process. 
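 * For example, claiming a single 2048-byte fragment out of a fully
 * free 16384-byte block splits that block: cs_nbfree is decremented
 * and cs_nffree is credited with the seven remaining fragments.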
1434 * 1435 * This implementation should mirror fs_blkfree 1436 * 1437 * => um_lock not held on entry or exit 1438 */ 1439 int 1440 ffs_blkalloc(struct inode *ip, daddr_t bno, long size) 1441 { 1442 int error; 1443 1444 error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size, 1445 ip->i_dev, ip->i_uid); 1446 if (error) 1447 return error; 1448 1449 return ffs_blkalloc_ump(ip->i_ump, bno, size); 1450 } 1451 1452 int 1453 ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size) 1454 { 1455 struct fs *fs = ump->um_fs; 1456 struct cg *cgp; 1457 struct buf *bp; 1458 int32_t fragno, cgbno; 1459 int i, error, cg, blk, frags, bbase; 1460 u_int8_t *blksfree; 1461 const int needswap = UFS_FSNEEDSWAP(fs); 1462 1463 KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 && 1464 ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag); 1465 KASSERT(bno < fs->fs_size); 1466 1467 cg = dtog(fs, bno); 1468 error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1469 (int)fs->fs_cgsize, B_MODIFY, &bp); 1470 if (error) { 1471 return error; 1472 } 1473 cgp = (struct cg *)bp->b_data; 1474 if (!cg_chkmagic(cgp, needswap)) { 1475 brelse(bp, 0); 1476 return EIO; 1477 } 1478 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1479 cgp->cg_time = ufs_rw64(time_second, needswap); 1480 cgbno = dtogd(fs, bno); 1481 blksfree = cg_blksfree(cgp, needswap); 1482 1483 mutex_enter(&ump->um_lock); 1484 if (size == fs->fs_bsize) { 1485 fragno = ffs_fragstoblks(fs, cgbno); 1486 if (!ffs_isblock(fs, blksfree, fragno)) { 1487 mutex_exit(&ump->um_lock); 1488 brelse(bp, 0); 1489 return EBUSY; 1490 } 1491 ffs_clrblock(fs, blksfree, fragno); 1492 ffs_clusteracct(fs, cgp, fragno, -1); 1493 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1494 fs->fs_cstotal.cs_nbfree--; 1495 fs->fs_cs(fs, cg).cs_nbfree--; 1496 } else { 1497 bbase = cgbno - ffs_fragnum(fs, cgbno); 1498 1499 frags = ffs_numfrags(fs, size); 1500 for (i = 0; i < frags; i++) { 1501 if (isclr(blksfree, cgbno + i)) { 1502 mutex_exit(&ump->um_lock); 1503 brelse(bp, 0); 1504 return EBUSY; 1505 } 1506 } 1507 /* 1508 * if a complete block is being split, account for it 1509 */ 1510 fragno = ffs_fragstoblks(fs, bbase); 1511 if (ffs_isblock(fs, blksfree, fragno)) { 1512 ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); 1513 fs->fs_cstotal.cs_nffree += fs->fs_frag; 1514 fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; 1515 ffs_clusteracct(fs, cgp, fragno, -1); 1516 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1517 fs->fs_cstotal.cs_nbfree--; 1518 fs->fs_cs(fs, cg).cs_nbfree--; 1519 } 1520 /* 1521 * decrement the counts associated with the old frags 1522 */ 1523 blk = blkmap(fs, blksfree, bbase); 1524 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); 1525 /* 1526 * allocate the fragment 1527 */ 1528 for (i = 0; i < frags; i++) { 1529 clrbit(blksfree, cgbno + i); 1530 } 1531 ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); 1532 fs->fs_cstotal.cs_nffree -= i; 1533 fs->fs_cs(fs, cg).cs_nffree -= i; 1534 /* 1535 * add back in counts associated with the new frags 1536 */ 1537 blk = blkmap(fs, blksfree, bbase); 1538 ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); 1539 } 1540 fs->fs_fmod = 1; 1541 ACTIVECG_CLR(fs, cg); 1542 mutex_exit(&ump->um_lock); 1543 bdwrite(bp); 1544 return 0; 1545 } 1546 1547 /* 1548 * Free a block or fragment. 1549 * 1550 * The specified block or fragment is placed back in the 1551 * free map. If a fragment is deallocated, a possible 1552 * block reassembly is checked. 
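 * For example, with fs_frag = 8, freeing the last in-use fragment of a
 * block whose other seven fragments are already free reassembles the
 * whole block: the eight fragments are converted into one free block
 * in the statistics (cs_nffree drops by seven net, cs_nbfree gains one).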
1553 * 1554 * => um_lock not held on entry or exit 1555 */ 1556 static void 1557 ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size) 1558 { 1559 struct cg *cgp; 1560 struct buf *bp; 1561 struct ufsmount *ump; 1562 daddr_t cgblkno; 1563 int error, cg; 1564 dev_t dev; 1565 const bool devvp_is_snapshot = (devvp->v_type != VBLK); 1566 const int needswap = UFS_FSNEEDSWAP(fs); 1567 1568 KASSERT(!devvp_is_snapshot); 1569 1570 cg = dtog(fs, bno); 1571 dev = devvp->v_rdev; 1572 ump = VFSTOUFS(spec_node_getmountedfs(devvp)); 1573 KASSERT(fs == ump->um_fs); 1574 cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg)); 1575 1576 error = bread(devvp, cgblkno, (int)fs->fs_cgsize, 1577 B_MODIFY, &bp); 1578 if (error) { 1579 return; 1580 } 1581 cgp = (struct cg *)bp->b_data; 1582 if (!cg_chkmagic(cgp, needswap)) { 1583 brelse(bp, 0); 1584 return; 1585 } 1586 1587 ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); 1588 1589 bdwrite(bp); 1590 } 1591 1592 struct discardopdata { 1593 struct work wk; /* must be first */ 1594 struct vnode *devvp; 1595 daddr_t bno; 1596 long size; 1597 }; 1598 1599 struct discarddata { 1600 struct fs *fs; 1601 struct discardopdata *entry; 1602 long maxsize; 1603 kmutex_t entrylk; 1604 struct workqueue *wq; 1605 int wqcnt, wqdraining; 1606 kmutex_t wqlk; 1607 kcondvar_t wqcv; 1608 /* timer for flush? */ 1609 }; 1610 1611 static void 1612 ffs_blkfree_td(struct fs *fs, struct discardopdata *td) 1613 { 1614 struct mount *mp = spec_node_getmountedfs(td->devvp); 1615 long todo; 1616 int error; 1617 1618 while (td->size) { 1619 todo = uimin(td->size, 1620 ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno)))); 1621 error = UFS_WAPBL_BEGIN(mp); 1622 if (error) { 1623 printf("ffs: failed to begin wapbl transaction" 1624 " for discard: %d\n", error); 1625 break; 1626 } 1627 ffs_blkfree_cg(fs, td->devvp, td->bno, todo); 1628 UFS_WAPBL_END(mp); 1629 td->bno += ffs_numfrags(fs, todo); 1630 td->size -= todo; 1631 } 1632 } 1633 1634 static void 1635 ffs_discardcb(struct work *wk, void *arg) 1636 { 1637 struct discardopdata *td = (void *)wk; 1638 struct discarddata *ts = arg; 1639 struct fs *fs = ts->fs; 1640 off_t start, len; 1641 #ifdef TRIMDEBUG 1642 int error; 1643 #endif 1644 1645 /* like FSBTODB but emits bytes; XXX move to fs.h */ 1646 #ifndef FFS_FSBTOBYTES 1647 #define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift) 1648 #endif 1649 1650 start = FFS_FSBTOBYTES(fs, td->bno); 1651 len = td->size; 1652 #ifdef TRIMDEBUG 1653 error = 1654 #endif 1655 VOP_FDISCARD(td->devvp, start, len); 1656 #ifdef TRIMDEBUG 1657 printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error); 1658 #endif 1659 1660 ffs_blkfree_td(fs, td); 1661 kmem_free(td, sizeof(*td)); 1662 mutex_enter(&ts->wqlk); 1663 ts->wqcnt--; 1664 if (ts->wqdraining && !ts->wqcnt) 1665 cv_signal(&ts->wqcv); 1666 mutex_exit(&ts->wqlk); 1667 } 1668 1669 void * 1670 ffs_discard_init(struct vnode *devvp, struct fs *fs) 1671 { 1672 struct discarddata *ts; 1673 int error; 1674 1675 ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); 1676 error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts, 1677 PRI_USER, IPL_NONE, 0); 1678 if (error) { 1679 kmem_free(ts, sizeof (*ts)); 1680 return NULL; 1681 } 1682 mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE); 1683 mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE); 1684 cv_init(&ts->wqcv, "trimwqcv"); 1685 ts->maxsize = 100*1024; /* XXX */ 1686 ts->fs = fs; 1687 return ts; 1688 } 1689 1690 void 1691 ffs_discard_finish(void *vts, int flags) 1692 { 1693 struct discarddata 
*ts = vts; 1694 struct discardopdata *td = NULL; 1695 1696 /* wait for workqueue to drain */ 1697 mutex_enter(&ts->wqlk); 1698 if (ts->wqcnt) { 1699 ts->wqdraining = 1; 1700 cv_wait(&ts->wqcv, &ts->wqlk); 1701 } 1702 mutex_exit(&ts->wqlk); 1703 1704 mutex_enter(&ts->entrylk); 1705 if (ts->entry) { 1706 td = ts->entry; 1707 ts->entry = NULL; 1708 } 1709 mutex_exit(&ts->entrylk); 1710 if (td) { 1711 /* XXX don't tell disk, its optional */ 1712 ffs_blkfree_td(ts->fs, td); 1713 #ifdef TRIMDEBUG 1714 printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size); 1715 #endif 1716 kmem_free(td, sizeof(*td)); 1717 } 1718 1719 cv_destroy(&ts->wqcv); 1720 mutex_destroy(&ts->entrylk); 1721 mutex_destroy(&ts->wqlk); 1722 workqueue_destroy(ts->wq); 1723 kmem_free(ts, sizeof(*ts)); 1724 } 1725 1726 void 1727 ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, 1728 ino_t inum) 1729 { 1730 struct ufsmount *ump; 1731 int error; 1732 dev_t dev; 1733 struct discarddata *ts; 1734 struct discardopdata *td; 1735 1736 dev = devvp->v_rdev; 1737 ump = VFSTOUFS(spec_node_getmountedfs(devvp)); 1738 if (ffs_snapblkfree(fs, devvp, bno, size, inum)) 1739 return; 1740 1741 error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); 1742 if (error) 1743 return; 1744 1745 if (!ump->um_discarddata) { 1746 ffs_blkfree_cg(fs, devvp, bno, size); 1747 return; 1748 } 1749 1750 #ifdef TRIMDEBUG 1751 printf("blkfree(%" PRId64 ",%ld)\n", bno, size); 1752 #endif 1753 ts = ump->um_discarddata; 1754 td = NULL; 1755 1756 mutex_enter(&ts->entrylk); 1757 if (ts->entry) { 1758 td = ts->entry; 1759 /* ffs deallocs backwards, check for prepend only */ 1760 if (td->bno == bno + ffs_numfrags(fs, size) 1761 && td->size + size <= ts->maxsize) { 1762 td->bno = bno; 1763 td->size += size; 1764 if (td->size < ts->maxsize) { 1765 #ifdef TRIMDEBUG 1766 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); 1767 #endif 1768 mutex_exit(&ts->entrylk); 1769 return; 1770 } 1771 size = 0; /* mark done */ 1772 } 1773 ts->entry = NULL; 1774 } 1775 mutex_exit(&ts->entrylk); 1776 1777 if (td) { 1778 #ifdef TRIMDEBUG 1779 printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size); 1780 #endif 1781 mutex_enter(&ts->wqlk); 1782 ts->wqcnt++; 1783 mutex_exit(&ts->wqlk); 1784 workqueue_enqueue(ts->wq, &td->wk, NULL); 1785 } 1786 if (!size) 1787 return; 1788 1789 td = kmem_alloc(sizeof(*td), KM_SLEEP); 1790 td->devvp = devvp; 1791 td->bno = bno; 1792 td->size = size; 1793 1794 if (td->size < ts->maxsize) { /* XXX always the case */ 1795 mutex_enter(&ts->entrylk); 1796 if (!ts->entry) { /* possible race? */ 1797 #ifdef TRIMDEBUG 1798 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); 1799 #endif 1800 ts->entry = td; 1801 td = NULL; 1802 } 1803 mutex_exit(&ts->entrylk); 1804 } 1805 if (td) { 1806 #ifdef TRIMDEBUG 1807 printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size); 1808 #endif 1809 mutex_enter(&ts->wqlk); 1810 ts->wqcnt++; 1811 mutex_exit(&ts->wqlk); 1812 workqueue_enqueue(ts->wq, &td->wk, NULL); 1813 } 1814 } 1815 1816 /* 1817 * Free a block or fragment from a snapshot cg copy. 1818 * 1819 * The specified block or fragment is placed back in the 1820 * free map. If a fragment is deallocated, a possible 1821 * block reassembly is checked. 
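 * (Unlike ffs_blkfree_cg(), the cylinder group is read through the
 * snapshot vnode itself, so cgblkno below is a logical block number
 * within the snapshot rather than a device block address.)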

/*
 * Free a block or fragment from a snapshot cg copy.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 *
 * => um_lock not held on entry or exit
 */
void
ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
    ino_t inum)
{
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	daddr_t cgblkno;
	int error, cg;
	dev_t dev;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp_is_snapshot);

	cg = dtog(fs, bno);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));

	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
	if (error)
		return;

	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return;
	}

	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);

	bdwrite(bp);
}

static void
ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
{
	struct cg *cgp;
	int32_t fragno, cgbno;
	int i, cg, blk, frags, bbase;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = dtog(fs, bno);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp, needswap);
	mutex_enter(&ump->um_lock);
	if (size == fs->fs_bsize) {
		fragno = ffs_fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp_is_snapshot) {
				mutex_exit(&ump->um_lock);
				return;
			}
			panic("%s: freeing free block: dev = 0x%llx, block = %"
			    PRId64 ", fs = %s", __func__,
			    (unsigned long long)dev, bno, fs->fs_fsmnt);
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
		if ((fs->fs_magic == FS_UFS1_MAGIC) &&
		    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
			i = old_cbtocylno(fs, cgbno);
			KASSERT(i >= 0);
			KASSERT(i < fs->fs_old_ncyl);
			KASSERT(old_cbtorpos(fs, cgbno) >= 0);
			KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
			ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
			    needswap);
			ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
		}
	} else {
		bbase = cgbno - ffs_fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
		/*
		 * deallocate the fragment
		 */
		frags = ffs_numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				panic("%s: freeing free frag: "
				    "dev = 0x%llx, block = %" PRId64
				    ", fs = %s", __func__,
				    (unsigned long long)dev, bno + i,
				    fs->fs_fsmnt);
			}
			setbit(blksfree, cgbno + i);
		}
		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
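		/*
		 * Worked example (illustrative, assuming fs_frag == 8):
		 * if frags 0-1 and 6-7 of this block were already free
		 * and frags 2-5 have just been freed above, then the
		 * ffs_fragacct(..., -1, ...) call before the loop removed
		 * the two old runs of two from cg_frsum[2], the
		 * ffs_fragacct(..., 1, ...) call below re-scans the
		 * updated map, and the reassembly check after it turns
		 * the now fully free block from fragment counts into a
		 * free block.
		 */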
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = ffs_fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
				i = old_cbtocylno(fs, bbase);
				KASSERT(i >= 0);
				KASSERT(i < fs->fs_old_ncyl);
				KASSERT(old_cbtorpos(fs, bbase) >= 0);
				KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
				ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
				    bbase)], 1, needswap);
				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
			}
		}
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Free an inode.
 */
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{

	return ffs_freefile(vp->v_mount, ino, mode);
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 *
 * => um_lock not held on entry or exit
 */
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct vnode *devvp;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	devvp = ump->um_devvp;
	dev = devvp->v_rdev;
	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));

	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
		    (long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}

	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);

	bdwrite(bp);

	return 0;
}

int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp->v_type != VBLK);

	cg = ino_to_cg(fs, ino);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}
	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);

	bdwrite(bp);

	return 0;
}
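
/*
 * A minimal standalone sketch of the bookkeeping ffs_blkfree_common()
 * does when the last fragment of a block is freed: the block's
 * fragment counts are traded for one whole free block.  EX_FRAG, the
 * ex_counts structure and the FFS_REASSEMBLY_EXAMPLE guard are
 * hypothetical; the real code also maintains cg_frsum, the cluster
 * map and the per-cg summary via ufs_add32()/fs_cs().
 */
#ifdef FFS_REASSEMBLY_EXAMPLE
#include <stdio.h>
#include <string.h>

#define EX_FRAG	8	/* fragments per block (toy fs_frag) */

struct ex_counts {
	long nffree;	/* free fragments, cf. cs_nffree */
	long nbfree;	/* free blocks, cf. cs_nbfree */
};

static void
ex_free_frags(char *map, struct ex_counts *cs, int first, int nfrags)
{
	int i, whole;

	for (i = 0; i < nfrags; i++)
		map[first + i] = 1;		/* mark each fragment free */
	cs->nffree += nfrags;

	whole = 1;				/* block fully reassembled? */
	for (i = 0; i < EX_FRAG; i++)
		if (!map[i])
			whole = 0;
	if (whole) {
		cs->nffree -= EX_FRAG;		/* as in the reassembly case */
		cs->nbfree++;
	}
}

int
main(void)
{
	char map[EX_FRAG];
	struct ex_counts cs = { 0, 0 };

	memset(map, 0, sizeof(map));
	ex_free_frags(map, &cs, 0, 3);	/* nffree = 3, nbfree = 0 */
	ex_free_frags(map, &cs, 3, 5);	/* block completes: nffree = 0, nbfree = 1 */
	printf("nffree=%ld nbfree=%ld\n", cs.nffree, cs.nbfree);
	return 0;
}
#endif /* FFS_REASSEMBLY_EXAMPLE */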

static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
	int cg;
	struct cg *cgp;
	u_int8_t *inosused;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	inosused = cg_inosused(cgp, needswap);
	ino %= fs->fs_ipg;
	if (isclr(inosused, ino)) {
		printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
		    (unsigned long long)dev, (unsigned long long)ino +
		    cg * fs->fs_ipg, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("%s: freeing free inode", __func__);
	}
	clrbit(inosused, ino);
	if (!devvp_is_snapshot)
		UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
		    ino + cg * fs->fs_ipg, mode);
	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
		cgp->cg_irotor = ufs_rw32(ino, needswap);
	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
	mutex_enter(&ump->um_lock);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Check to see if a file is free.
 */
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
	struct cg *cgp;
	struct buf *bp;
	daddr_t cgbno;
	int ret, cg;
	u_int8_t *inosused;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);

	KASSERT(devvp_is_snapshot);

	cg = ino_to_cg(fs, ino);
	if (devvp_is_snapshot)
		cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	else
		cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		return 1;
	if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
		return 1;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
		brelse(bp, 0);
		return 1;
	}
	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp, 0);
	return ret;
}
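
/*
 * A minimal standalone sketch of the inode bookkeeping used by
 * ffs_freefile_common() and ffs_checkfreefile() above: an inode number
 * splits into a cylinder group (ino / ipg, cf. ino_to_cg()) and a bit
 * offset within that group's cg_inosused map (ino % ipg).  EX_IPG,
 * EX_NBBY, the ex_* helpers and the FFS_INOMAP_EXAMPLE guard are
 * hypothetical; the kernel uses isclr()/clrbit() on the real bitmap
 * and adjusts cs_nifree/cs_ndir under um_lock.
 */
#ifdef FFS_INOMAP_EXAMPLE
#include <stdio.h>
#include <string.h>

#define EX_IPG	64L	/* inodes per cylinder group (toy fs_ipg) */
#define EX_NBBY	8	/* bits per byte */

static int
ex_isset(const unsigned char *map, long bit)
{

	return (map[bit / EX_NBBY] >> (bit % EX_NBBY)) & 1;
}

static void
ex_clr(unsigned char *map, long bit)
{

	map[bit / EX_NBBY] &= (unsigned char)~(1u << (bit % EX_NBBY));
}

int
main(void)
{
	unsigned char inosused[EX_IPG / EX_NBBY];
	long ino = 70;			/* file-system-wide inode number */
	long cg = ino / EX_IPG;		/* -> cylinder group 1 */
	long bit = ino % EX_IPG;	/* -> bit 6 of that group's map */

	memset(inosused, 0xff, sizeof(inosused));	/* all inodes in use */
	printf("cg %ld bit %ld in use: %d\n", cg, bit, ex_isset(inosused, bit));
	ex_clr(inosused, bit);				/* free the inode */
	printf("cg %ld bit %ld in use: %d\n", cg, bit, ex_isset(inosused, bit));
	return 0;
}
#endif /* FFS_INOMAP_EXAMPLE */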

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block when none are
 * available.
 */
static int32_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
	int32_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	int ostart, olen;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	/* KASSERT(mutex_owned(&ump->um_lock)); */

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
	blksfree = cg_blksfree(cgp, needswap);
	len = howmany(fs->fs_fpg, NBBY) - start;
	ostart = start;
	olen = len;
	loc = scanc((u_int)len,
	    (const u_char *)&blksfree[start],
	    (const u_char *)fragtbl[fs->fs_frag],
	    (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = scanc((u_int)len,
		    (const u_char *)&blksfree[0],
		    (const u_char *)fragtbl[fs->fs_frag],
		    (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
		if (loc == 0) {
			panic("%s: map corrupted: start=%d, len=%d, "
			    "fs = %s, offset=%d/%ld, cg %d", __func__,
			    ostart, olen, fs->fs_fsmnt,
			    ufs_rw32(cgp->cg_freeoff, needswap),
			    (long)blksfree - (long)cgp, cgp->cg_cgx);
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = ufs_rw32(bno, needswap);
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	panic("%s: block not in map: bno=%d, fs=%s", __func__,
	    bno, fs->fs_fsmnt);
	/* return (-1); */
}

/*
 * Fserr prints the name of a file system with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 */
static void
ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
{
	KASSERT(cred != NULL);

	if (cred == NOCRED || cred == FSCRED) {
		log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
		    curproc->p_pid, curproc->p_comm,
		    fs->fs_fsmnt, cp);
	} else {
		log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
		    kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
		    fs->fs_fsmnt, cp);
	}
}
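
/*
 * A minimal standalone sketch of what ffs_mapsearch() computes: the
 * first run of allocsiz consecutive free fragments inside one block of
 * the free map.  The real routine first uses scanc() with the
 * precomputed fragtbl[] to find a candidate byte and then walks the
 * around[]/inside[] masks; this sketch, guarded by the hypothetical
 * FFS_MAPSEARCH_EXAMPLE symbol, just walks one byte-per-fragment map
 * directly, with EX_FRAG standing in for fs_frag.
 */
#ifdef FFS_MAPSEARCH_EXAMPLE
#include <stdio.h>

#define EX_FRAG	8	/* fragments per block (toy fs_frag) */

/* returns the first fragment of a free run of allocsiz frags, or -1 */
static int
ex_mapsearch(const unsigned char *blksfree, int allocsiz)
{
	int pos, i;

	for (pos = 0; pos + allocsiz <= EX_FRAG; pos++) {
		for (i = 0; i < allocsiz; i++)
			if (!blksfree[pos + i])
				break;
		if (i == allocsiz)
			return pos;
	}
	return -1;
}

int
main(void)
{
	/* fragments 2-4 and 6-7 are free */
	const unsigned char map[EX_FRAG] = { 0, 0, 1, 1, 1, 0, 1, 1 };

	printf("run of 3 at %d\n", ex_mapsearch(map, 3));	/* -> 2 */
	printf("run of 4 at %d\n", ex_mapsearch(map, 4));	/* -> -1 */
	return 0;
}
#endif /* FFS_MAPSEARCH_EXAMPLE */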