1 /* $NetBSD: ffs_alloc.c,v 1.159 2017/12/07 21:53:41 chs Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2002 Networks Associates Technology, Inc. 34 * All rights reserved. 35 * 36 * This software was developed for the FreeBSD Project by Marshall 37 * Kirk McKusick and Network Associates Laboratories, the Security 38 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 39 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 40 * research program 41 * 42 * Copyright (c) 1982, 1986, 1989, 1993 43 * The Regents of the University of California. All rights reserved. 44 * 45 * Redistribution and use in source and binary forms, with or without 46 * modification, are permitted provided that the following conditions 47 * are met: 48 * 1. Redistributions of source code must retain the above copyright 49 * notice, this list of conditions and the following disclaimer. 50 * 2. Redistributions in binary form must reproduce the above copyright 51 * notice, this list of conditions and the following disclaimer in the 52 * documentation and/or other materials provided with the distribution. 53 * 3. Neither the name of the University nor the names of its contributors 54 * may be used to endorse or promote products derived from this software 55 * without specific prior written permission. 56 * 57 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 59 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 60 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 61 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 63 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 64 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 65 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 66 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 67 * SUCH DAMAGE. 68 * 69 * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 70 */ 71 72 #include <sys/cdefs.h> 73 __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.159 2017/12/07 21:53:41 chs Exp $"); 74 75 #if defined(_KERNEL_OPT) 76 #include "opt_ffs.h" 77 #include "opt_quota.h" 78 #include "opt_uvm_page_trkown.h" 79 #endif 80 81 #include <sys/param.h> 82 #include <sys/systm.h> 83 #include <sys/buf.h> 84 #include <sys/cprng.h> 85 #include <sys/kauth.h> 86 #include <sys/kernel.h> 87 #include <sys/mount.h> 88 #include <sys/proc.h> 89 #include <sys/syslog.h> 90 #include <sys/vnode.h> 91 #include <sys/wapbl.h> 92 #include <sys/cprng.h> 93 94 #include <miscfs/specfs/specdev.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/ufsmount.h> 97 #include <ufs/ufs/inode.h> 98 #include <ufs/ufs/ufs_extern.h> 99 #include <ufs/ufs/ufs_bswap.h> 100 #include <ufs/ufs/ufs_wapbl.h> 101 102 #include <ufs/ffs/fs.h> 103 #include <ufs/ffs/ffs_extern.h> 104 105 #ifdef UVM_PAGE_TRKOWN 106 #include <uvm/uvm.h> 107 #endif 108 109 static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int, int); 110 static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int); 111 static ino_t ffs_dirpref(struct inode *); 112 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); 113 static void ffs_fserr(struct fs *, kauth_cred_t, const char *); 114 static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, int, 115 daddr_t (*)(struct inode *, int, daddr_t, int, int, int)); 116 static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int, int); 117 static int32_t ffs_mapsearch(struct fs *, struct cg *, 118 daddr_t, int); 119 static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *, 120 daddr_t, long, bool); 121 static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t, 122 int, bool); 123 124 /* if 1, changes in optimalization strategy are logged */ 125 int ffs_log_changeopt = 0; 126 127 /* in ffs_tables.c */ 128 extern const int inside[], around[]; 129 extern const u_char * const fragtbl[]; 130 131 /* Basic consistency check for block allocations */ 132 static int 133 ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno, 134 long size, dev_t dev, ino_t inum) 135 { 136 if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 || 137 ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) { 138 panic("%s: bad size: dev = 0x%llx, bno = %" PRId64 139 " bsize = %d, size = %ld, fs = %s", func, 140 (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); 141 } 142 143 if (bno >= fs->fs_size) { 144 printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno, 145 (unsigned long long)inum); 146 ffs_fserr(fs, NOCRED, "bad block"); 147 return EINVAL; 148 } 149 return 0; 150 } 151 152 /* 153 * Allocate a block in the file system. 154 * 155 * The size of the requested block is given, which must be some 156 * multiple of fs_fsize and <= fs_bsize. 157 * A preference may be optionally specified. 
If a preference is given 158 * the following hierarchy is used to allocate a block: 159 * 1) allocate the requested block. 160 * 2) allocate a rotationally optimal block in the same cylinder. 161 * 3) allocate a block in the same cylinder group. 162 * 4) quadradically rehash into other cylinder groups, until an 163 * available block is located. 164 * If no block preference is given the following hierarchy is used 165 * to allocate a block: 166 * 1) allocate a block in the cylinder group that contains the 167 * inode for the file. 168 * 2) quadradically rehash into other cylinder groups, until an 169 * available block is located. 170 * 171 * => called with um_lock held 172 * => releases um_lock before returning 173 */ 174 int 175 ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, 176 int flags, kauth_cred_t cred, daddr_t *bnp) 177 { 178 struct ufsmount *ump; 179 struct fs *fs; 180 daddr_t bno; 181 int cg; 182 #if defined(QUOTA) || defined(QUOTA2) 183 int error; 184 #endif 185 186 fs = ip->i_fs; 187 ump = ip->i_ump; 188 189 KASSERT(mutex_owned(&ump->um_lock)); 190 191 #ifdef UVM_PAGE_TRKOWN 192 193 /* 194 * Sanity-check that allocations within the file size 195 * do not allow other threads to read the stale contents 196 * of newly allocated blocks. 197 * Usually pages will exist to cover the new allocation. 198 * There is an optimization in ffs_write() where we skip 199 * creating pages if several conditions are met: 200 * - the file must not be mapped (in any user address space). 201 * - the write must cover whole pages and whole blocks. 202 * If those conditions are not met then pages must exist and 203 * be locked by the current thread. 204 */ 205 206 struct vnode *vp = ITOV(ip); 207 if (vp->v_type == VREG && 208 ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) && 209 ((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 || 210 ffs_blkoff(fs, size) != 0)) { 211 struct vm_page *pg; 212 struct uvm_object *uobj = &vp->v_uobj; 213 voff_t off = trunc_page(ffs_lblktosize(fs, lbn)); 214 voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size); 215 216 mutex_enter(uobj->vmobjlock); 217 while (off < endoff) { 218 pg = uvm_pagelookup(uobj, off); 219 KASSERT((pg != NULL && pg->owner_tag != NULL && 220 pg->owner == curproc->p_pid && 221 pg->lowner == curlwp->l_lid)); 222 off += PAGE_SIZE; 223 } 224 mutex_exit(uobj->vmobjlock); 225 } 226 #endif 227 228 *bnp = 0; 229 230 KASSERTMSG((cred != NOCRED), "missing credential"); 231 KASSERTMSG(((u_int)size <= fs->fs_bsize), 232 "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s", 233 (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 234 KASSERTMSG((ffs_fragoff(fs, size) == 0), 235 "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s", 236 (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 237 238 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 239 goto nospace; 240 if (freespace(fs, fs->fs_minfree) <= 0 && 241 kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, 242 NULL, NULL) != 0) 243 goto nospace; 244 #if defined(QUOTA) || defined(QUOTA2) 245 mutex_exit(&ump->um_lock); 246 if ((error = chkdq(ip, btodb(size), cred, 0)) != 0) 247 return (error); 248 mutex_enter(&ump->um_lock); 249 #endif 250 251 if (bpref >= fs->fs_size) 252 bpref = 0; 253 if (bpref == 0) 254 cg = ino_to_cg(fs, ip->i_number); 255 else 256 cg = dtog(fs, bpref); 257 bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg); 258 if (bno > 0) { 259 DIP_ADD(ip, blocks, btodb(size)); 260 
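/*
 * Success: the new fragments were charged to the inode's block count
 * above; flag the inode so its change/update times are written back.
 */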
ip->i_flag |= IN_CHANGE | IN_UPDATE; 261 *bnp = bno; 262 return (0); 263 } 264 #if defined(QUOTA) || defined(QUOTA2) 265 /* 266 * Restore user's disk quota because allocation failed. 267 */ 268 (void) chkdq(ip, -btodb(size), cred, FORCE); 269 #endif 270 if (flags & B_CONTIG) { 271 /* 272 * XXX ump->um_lock handling is "suspect" at best. 273 * For the case where ffs_hashalloc() fails early 274 * in the B_CONTIG case we reach here with um_lock 275 * already unlocked, so we can't release it again 276 * like in the normal error path. See kern/39206. 277 * 278 * 279 * Fail silently - it's up to our caller to report 280 * errors. 281 */ 282 return (ENOSPC); 283 } 284 nospace: 285 mutex_exit(&ump->um_lock); 286 ffs_fserr(fs, cred, "file system full"); 287 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); 288 return (ENOSPC); 289 } 290 291 /* 292 * Reallocate a fragment to a bigger size 293 * 294 * The number and size of the old block is given, and a preference 295 * and new size is also specified. The allocator attempts to extend 296 * the original block. Failing that, the regular block allocator is 297 * invoked to get an appropriate block. 298 * 299 * => called with um_lock held 300 * => return with um_lock released 301 */ 302 int 303 ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize, 304 int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop) 305 { 306 struct ufsmount *ump; 307 struct fs *fs; 308 struct buf *bp; 309 int cg, request, error; 310 daddr_t bprev, bno; 311 312 fs = ip->i_fs; 313 ump = ip->i_ump; 314 315 KASSERT(mutex_owned(&ump->um_lock)); 316 317 #ifdef UVM_PAGE_TRKOWN 318 319 /* 320 * Sanity-check that allocations within the file size 321 * do not allow other threads to read the stale contents 322 * of newly allocated blocks. 323 * Unlike in ffs_alloc(), here pages must always exist 324 * for such allocations, because only the last block of a file 325 * can be a fragment and ffs_write() will reallocate the 326 * fragment to the new size using ufs_balloc_range(), 327 * which always creates pages to cover blocks it allocates. 
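 * The loop below therefore requires that every page covering the old
 * fragment already exists and is owned by the current lwp.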
328 */ 329 330 if (ITOV(ip)->v_type == VREG) { 331 struct vm_page *pg; 332 struct uvm_object *uobj = &ITOV(ip)->v_uobj; 333 voff_t off = trunc_page(ffs_lblktosize(fs, lbprev)); 334 voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize); 335 336 mutex_enter(uobj->vmobjlock); 337 while (off < endoff) { 338 pg = uvm_pagelookup(uobj, off); 339 KASSERT(pg->owner == curproc->p_pid && 340 pg->lowner == curlwp->l_lid); 341 off += PAGE_SIZE; 342 } 343 mutex_exit(uobj->vmobjlock); 344 } 345 #endif 346 347 KASSERTMSG((cred != NOCRED), "missing credential"); 348 KASSERTMSG(((u_int)osize <= fs->fs_bsize), 349 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 350 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 351 fs->fs_fsmnt); 352 KASSERTMSG((ffs_fragoff(fs, osize) == 0), 353 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 354 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 355 fs->fs_fsmnt); 356 KASSERTMSG(((u_int)nsize <= fs->fs_bsize), 357 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 358 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 359 fs->fs_fsmnt); 360 KASSERTMSG((ffs_fragoff(fs, nsize) == 0), 361 "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", 362 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 363 fs->fs_fsmnt); 364 365 if (freespace(fs, fs->fs_minfree) <= 0 && 366 kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, 367 NULL, NULL) != 0) { 368 mutex_exit(&ump->um_lock); 369 goto nospace; 370 } 371 if (fs->fs_magic == FS_UFS2_MAGIC) 372 bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs)); 373 else 374 bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs)); 375 376 if (bprev == 0) { 377 panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %" 378 PRId64 ", fs = %s", __func__, 379 (unsigned long long)ip->i_dev, fs->fs_bsize, bprev, 380 fs->fs_fsmnt); 381 } 382 mutex_exit(&ump->um_lock); 383 384 /* 385 * Allocate the extra space in the buffer. 386 */ 387 if (bpp != NULL && 388 (error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) { 389 return (error); 390 } 391 #if defined(QUOTA) || defined(QUOTA2) 392 if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) { 393 if (bpp != NULL) { 394 brelse(bp, 0); 395 } 396 return (error); 397 } 398 #endif 399 /* 400 * Check for extension in the existing location. 401 */ 402 cg = dtog(fs, bprev); 403 mutex_enter(&ump->um_lock); 404 if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) { 405 DIP_ADD(ip, blocks, btodb(nsize - osize)); 406 ip->i_flag |= IN_CHANGE | IN_UPDATE; 407 408 if (bpp != NULL) { 409 if (bp->b_blkno != FFS_FSBTODB(fs, bno)) { 410 panic("%s: bad blockno %#llx != %#llx", 411 __func__, (unsigned long long) bp->b_blkno, 412 (unsigned long long)FFS_FSBTODB(fs, bno)); 413 } 414 allocbuf(bp, nsize, 1); 415 memset((char *)bp->b_data + osize, 0, nsize - osize); 416 mutex_enter(bp->b_objlock); 417 KASSERT(!cv_has_waiters(&bp->b_done)); 418 bp->b_oflags |= BO_DONE; 419 mutex_exit(bp->b_objlock); 420 *bpp = bp; 421 } 422 if (blknop != NULL) { 423 *blknop = bno; 424 } 425 return (0); 426 } 427 /* 428 * Allocate a new disk location. 429 */ 430 if (bpref >= fs->fs_size) 431 bpref = 0; 432 switch ((int)fs->fs_optim) { 433 case FS_OPTSPACE: 434 /* 435 * Allocate an exact sized fragment. Although this makes 436 * best use of space, we will waste time relocating it if 437 * the file continues to grow. 
If the fragmentation is 438 * less than half of the minimum free reserve, we choose 439 * to begin optimizing for time. 440 */ 441 request = nsize; 442 if (fs->fs_minfree < 5 || 443 fs->fs_cstotal.cs_nffree > 444 fs->fs_dsize * fs->fs_minfree / (2 * 100)) 445 break; 446 447 if (ffs_log_changeopt) { 448 log(LOG_NOTICE, 449 "%s: optimization changed from SPACE to TIME\n", 450 fs->fs_fsmnt); 451 } 452 453 fs->fs_optim = FS_OPTTIME; 454 break; 455 case FS_OPTTIME: 456 /* 457 * At this point we have discovered a file that is trying to 458 * grow a small fragment to a larger fragment. To save time, 459 * we allocate a full sized block, then free the unused portion. 460 * If the file continues to grow, the `ffs_fragextend' call 461 * above will be able to grow it in place without further 462 * copying. If aberrant programs cause disk fragmentation to 463 * grow within 2% of the free reserve, we choose to begin 464 * optimizing for space. 465 */ 466 request = fs->fs_bsize; 467 if (fs->fs_cstotal.cs_nffree < 468 fs->fs_dsize * (fs->fs_minfree - 2) / 100) 469 break; 470 471 if (ffs_log_changeopt) { 472 log(LOG_NOTICE, 473 "%s: optimization changed from TIME to SPACE\n", 474 fs->fs_fsmnt); 475 } 476 477 fs->fs_optim = FS_OPTSPACE; 478 break; 479 default: 480 panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s", 481 __func__, (unsigned long long)ip->i_dev, fs->fs_optim, 482 fs->fs_fsmnt); 483 /* NOTREACHED */ 484 } 485 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg); 486 if (bno > 0) { 487 /* 488 * Use forced deallocation registration; we can't handle 489 * failure here. This is safe because this path is hit 490 * at most once per write operation, when a fragment is 491 * extended to a longer fragment or to a full block. 492 */ 493 if ((ip->i_ump->um_mountp->mnt_wapbl) && 494 (ITOV(ip)->v_type != VREG)) { 495 /* this should never fail */ 496 error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE( 497 ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev), 498 osize); 499 if (error) 500 panic("ffs_realloccg: dealloc registration failed"); 501 } else { 502 ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, 503 ip->i_number); 504 } 505 DIP_ADD(ip, blocks, btodb(nsize - osize)); 506 ip->i_flag |= IN_CHANGE | IN_UPDATE; 507 if (bpp != NULL) { 508 bp->b_blkno = FFS_FSBTODB(fs, bno); 509 allocbuf(bp, nsize, 1); 510 memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); 511 mutex_enter(bp->b_objlock); 512 KASSERT(!cv_has_waiters(&bp->b_done)); 513 bp->b_oflags |= BO_DONE; 514 mutex_exit(bp->b_objlock); 515 *bpp = bp; 516 } 517 if (blknop != NULL) { 518 *blknop = bno; 519 } 520 return (0); 521 } 522 mutex_exit(&ump->um_lock); 523 524 #if defined(QUOTA) || defined(QUOTA2) 525 /* 526 * Restore user's disk quota because allocation failed. 527 */ 528 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 529 #endif 530 if (bpp != NULL) { 531 brelse(bp, 0); 532 } 533 534 nospace: 535 /* 536 * no space available 537 */ 538 ffs_fserr(fs, cred, "file system full"); 539 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); 540 return (ENOSPC); 541 } 542 543 /* 544 * Allocate an inode in the file system. 545 * 546 * If allocating a directory, use ffs_dirpref to select the inode. 547 * If allocating in a directory, the following hierarchy is followed: 548 * 1) allocate the preferred inode. 549 * 2) allocate an inode in the same cylinder group. 550 * 3) quadratically rehash into other cylinder groups, until an 551 * available inode is located.
552 * If no inode preference is given the following hierarchy is used 553 * to allocate an inode: 554 * 1) allocate an inode in cylinder group 0. 555 * 2) quadratically rehash into other cylinder groups, until an 556 * available inode is located. 557 * 558 * => um_lock not held upon entry or return 559 */ 560 int 561 ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop) 562 { 563 struct ufsmount *ump; 564 struct inode *pip; 565 struct fs *fs; 566 ino_t ino, ipref; 567 int cg, error; 568 569 UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); 570 571 pip = VTOI(pvp); 572 fs = pip->i_fs; 573 ump = pip->i_ump; 574 575 error = UFS_WAPBL_BEGIN(pvp->v_mount); 576 if (error) { 577 return error; 578 } 579 mutex_enter(&ump->um_lock); 580 if (fs->fs_cstotal.cs_nifree == 0) 581 goto noinodes; 582 583 if ((mode & IFMT) == IFDIR) 584 ipref = ffs_dirpref(pip); 585 else 586 ipref = pip->i_number; 587 if (ipref >= fs->fs_ncg * fs->fs_ipg) 588 ipref = 0; 589 cg = ino_to_cg(fs, ipref); 590 /* 591 * Track the number of dirs created one after another 592 * in the same cg without intervening files. 593 */ 594 if ((mode & IFMT) == IFDIR) { 595 if (fs->fs_contigdirs[cg] < 255) 596 fs->fs_contigdirs[cg]++; 597 } else { 598 if (fs->fs_contigdirs[cg] > 0) 599 fs->fs_contigdirs[cg]--; 600 } 601 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg); 602 if (ino == 0) 603 goto noinodes; 604 UFS_WAPBL_END(pvp->v_mount); 605 *inop = ino; 606 return 0; 607 608 noinodes: 609 mutex_exit(&ump->um_lock); 610 UFS_WAPBL_END(pvp->v_mount); 611 ffs_fserr(fs, cred, "out of inodes"); 612 uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); 613 return ENOSPC; 614 } 615 616 /* 617 * Find a cylinder group in which to place a directory. 618 * 619 * The policy implemented by this algorithm is to allocate a 620 * directory inode in the same cylinder group as its parent 621 * directory, but also to reserve space for its files' inodes 622 * and data. Restrict the number of directories which may be 623 * allocated one after another in the same cylinder group 624 * without intervening allocation of files. 625 * 626 * If we allocate a first level directory then force allocation 627 * in another cylinder group. 628 */ 629 static ino_t 630 ffs_dirpref(struct inode *pip) 631 { 632 register struct fs *fs; 633 int cg, prefcg; 634 int64_t dirsize, cgsize, curdsz; 635 int avgifree, avgbfree, avgndir; 636 int minifree, minbfree, maxndir; 637 int mincg, minndir; 638 int maxcontigdirs; 639 640 KASSERT(mutex_owned(&pip->i_ump->um_lock)); 641 642 fs = pip->i_fs; 643 644 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 645 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 646 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 647 648 /* 649 * Force allocation in another cg if creating a first level dir.
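 * (The code below picks a random starting cg and then, among cgs with
 * at least the average number of free inodes and free blocks, chooses
 * the one currently holding the fewest directories.)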
650 */ 651 if (ITOV(pip)->v_vflag & VV_ROOT) { 652 prefcg = cprng_fast32() % fs->fs_ncg; 653 mincg = prefcg; 654 minndir = fs->fs_ipg; 655 for (cg = prefcg; cg < fs->fs_ncg; cg++) 656 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 657 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 658 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 659 mincg = cg; 660 minndir = fs->fs_cs(fs, cg).cs_ndir; 661 } 662 for (cg = 0; cg < prefcg; cg++) 663 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 664 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 665 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 666 mincg = cg; 667 minndir = fs->fs_cs(fs, cg).cs_ndir; 668 } 669 return ((ino_t)(fs->fs_ipg * mincg)); 670 } 671 672 /* 673 * Count various limits which used for 674 * optimal allocation of a directory inode. 675 * Try cylinder groups with >75% avgifree and avgbfree. 676 * Avoid cylinder groups with no free blocks or inodes as that 677 * triggers an I/O-expensive cylinder group scan. 678 */ 679 maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); 680 minifree = avgifree - avgifree / 4; 681 if (minifree < 1) 682 minifree = 1; 683 minbfree = avgbfree - avgbfree / 4; 684 if (minbfree < 1) 685 minbfree = 1; 686 cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg; 687 dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir; 688 if (avgndir != 0) { 689 curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir; 690 if (dirsize < curdsz) 691 dirsize = curdsz; 692 } 693 if (cgsize < dirsize * 255) 694 maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize; 695 else 696 maxcontigdirs = 255; 697 if (fs->fs_avgfpdir > 0) 698 maxcontigdirs = min(maxcontigdirs, 699 fs->fs_ipg / fs->fs_avgfpdir); 700 if (maxcontigdirs == 0) 701 maxcontigdirs = 1; 702 703 /* 704 * Limit number of dirs in one cg and reserve space for 705 * regular files, but only if we have no deficit in 706 * inodes or space. 707 */ 708 prefcg = ino_to_cg(fs, pip->i_number); 709 for (cg = prefcg; cg < fs->fs_ncg; cg++) 710 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 711 fs->fs_cs(fs, cg).cs_nifree >= minifree && 712 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 713 if (fs->fs_contigdirs[cg] < maxcontigdirs) 714 return ((ino_t)(fs->fs_ipg * cg)); 715 } 716 for (cg = 0; cg < prefcg; cg++) 717 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 718 fs->fs_cs(fs, cg).cs_nifree >= minifree && 719 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 720 if (fs->fs_contigdirs[cg] < maxcontigdirs) 721 return ((ino_t)(fs->fs_ipg * cg)); 722 } 723 /* 724 * This is a backstop when we are deficient in space. 725 */ 726 for (cg = prefcg; cg < fs->fs_ncg; cg++) 727 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 728 return ((ino_t)(fs->fs_ipg * cg)); 729 for (cg = 0; cg < prefcg; cg++) 730 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 731 break; 732 return ((ino_t)(fs->fs_ipg * cg)); 733 } 734 735 /* 736 * Select the desired position for the next block in a file. The file is 737 * logically divided into sections. The first section is composed of the 738 * direct blocks. Each additional section contains fs_maxbpg blocks. 739 * 740 * If no blocks have been allocated in the first section, the policy is to 741 * request a block in the same cylinder group as the inode that describes 742 * the file. If no blocks have been allocated in any other section, the 743 * policy is to place the section in a cylinder group with a greater than 744 * average number of free blocks. An appropriate cylinder group is found 745 * by using a rotor that sweeps the cylinder groups. 
When a new group of 746 * blocks is needed, the sweep begins in the cylinder group following the 747 * cylinder group from which the previous allocation was made. The sweep 748 * continues until a cylinder group with greater than the average number 749 * of free blocks is found. If the allocation is for the first block in an 750 * indirect block, the information on the previous allocation is unavailable; 751 * here a best guess is made based upon the logical block number being 752 * allocated. 753 * 754 * If a section is already partially allocated, the policy is to 755 * contiguously allocate fs_maxcontig blocks. The end of one of these 756 * contiguous blocks and the beginning of the next is laid out 757 * contiguously if possible. 758 * 759 * => um_lock held on entry and exit 760 */ 761 daddr_t 762 ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, 763 int32_t *bap /* XXX ondisk32 */) 764 { 765 struct fs *fs; 766 int cg; 767 int avgbfree, startcg; 768 769 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 770 771 fs = ip->i_fs; 772 773 /* 774 * If allocating a contiguous file with B_CONTIG, use the hints 775 * in the inode extensions to return the desired block. 776 * 777 * For metadata (indirect blocks) return the address of where 778 * the first indirect block resides - we'll scan for the next 779 * available slot if we need to allocate more than one indirect 780 * block. For data, return the address of the actual block 781 * relative to the address of the first data block. 782 */ 783 if (flags & B_CONTIG) { 784 KASSERT(ip->i_ffs_first_data_blk != 0); 785 KASSERT(ip->i_ffs_first_indir_blk != 0); 786 if (flags & B_METAONLY) 787 return ip->i_ffs_first_indir_blk; 788 else 789 return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); 790 } 791 792 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 793 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { 794 cg = ino_to_cg(fs, ip->i_number); 795 return (cgbase(fs, cg) + fs->fs_frag); 796 } 797 /* 798 * Find a cylinder with greater than average number of 799 * unused data blocks. 800 */ 801 if (indx == 0 || bap[indx - 1] == 0) 802 startcg = 803 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 804 else 805 startcg = dtog(fs, 806 ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); 807 startcg %= fs->fs_ncg; 808 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 809 for (cg = startcg; cg < fs->fs_ncg; cg++) 810 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 811 return (cgbase(fs, cg) + fs->fs_frag); 812 } 813 for (cg = 0; cg < startcg; cg++) 814 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 815 return (cgbase(fs, cg) + fs->fs_frag); 816 } 817 return (0); 818 } 819 /* 820 * We just always try to lay things out contiguously. 821 */ 822 return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; 823 } 824 825 daddr_t 826 ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, 827 int64_t *bap) 828 { 829 struct fs *fs; 830 int cg; 831 int avgbfree, startcg; 832 833 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 834 835 fs = ip->i_fs; 836 837 /* 838 * If allocating a contiguous file with B_CONTIG, use the hints 839 * in the inode extensions to return the desired block. 840 * 841 * For metadata (indirect blocks) return the address of where 842 * the first indirect block resides - we'll scan for the next 843 * available slot if we need to allocate more than one indirect 844 * block. For data, return the address of the actual block 845 * relative to the address of the first data block.
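 * The logic below mirrors ffs_blkpref_ufs1() above; only the width of
 * the on-disk block pointers (64-bit here) and the byte-swap helper
 * differ.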
846 */ 847 if (flags & B_CONTIG) { 848 KASSERT(ip->i_ffs_first_data_blk != 0); 849 KASSERT(ip->i_ffs_first_indir_blk != 0); 850 if (flags & B_METAONLY) 851 return ip->i_ffs_first_indir_blk; 852 else 853 return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); 854 } 855 856 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 857 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { 858 cg = ino_to_cg(fs, ip->i_number); 859 return (cgbase(fs, cg) + fs->fs_frag); 860 } 861 /* 862 * Find a cylinder with greater than average number of 863 * unused data blocks. 864 */ 865 if (indx == 0 || bap[indx - 1] == 0) 866 startcg = 867 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 868 else 869 startcg = dtog(fs, 870 ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); 871 startcg %= fs->fs_ncg; 872 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 873 for (cg = startcg; cg < fs->fs_ncg; cg++) 874 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 875 return (cgbase(fs, cg) + fs->fs_frag); 876 } 877 for (cg = 0; cg < startcg; cg++) 878 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 879 return (cgbase(fs, cg) + fs->fs_frag); 880 } 881 return (0); 882 } 883 /* 884 * We just always try to lay things out contiguously. 885 */ 886 return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; 887 } 888 889 890 /* 891 * Implement the cylinder overflow algorithm. 892 * 893 * The policy implemented by this algorithm is: 894 * 1) allocate the block in its requested cylinder group. 895 * 2) quadradically rehash on the cylinder group number. 896 * 3) brute force search for a free block. 897 * 898 * => called with um_lock held 899 * => returns with um_lock released on success, held on failure 900 * (*allocator releases lock on success, retains lock on failure) 901 */ 902 /*VARARGS5*/ 903 static daddr_t 904 ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, 905 int size /* size for data blocks, mode for inodes */, 906 int realsize, 907 int flags, 908 daddr_t (*allocator)(struct inode *, int, daddr_t, int, int, int)) 909 { 910 struct fs *fs; 911 daddr_t result; 912 int i, icg = cg; 913 914 fs = ip->i_fs; 915 /* 916 * 1: preferred cylinder group 917 */ 918 result = (*allocator)(ip, cg, pref, size, realsize, flags); 919 if (result) 920 return (result); 921 922 if (flags & B_CONTIG) 923 return (result); 924 /* 925 * 2: quadratic rehash 926 */ 927 for (i = 1; i < fs->fs_ncg; i *= 2) { 928 cg += i; 929 if (cg >= fs->fs_ncg) 930 cg -= fs->fs_ncg; 931 result = (*allocator)(ip, cg, 0, size, realsize, flags); 932 if (result) 933 return (result); 934 } 935 /* 936 * 3: brute force search 937 * Note that we start at i == 2, since 0 was checked initially, 938 * and 1 is always checked in the quadratic rehash. 939 */ 940 cg = (icg + 2) % fs->fs_ncg; 941 for (i = 2; i < fs->fs_ncg; i++) { 942 result = (*allocator)(ip, cg, 0, size, realsize, flags); 943 if (result) 944 return (result); 945 cg++; 946 if (cg == fs->fs_ncg) 947 cg = 0; 948 } 949 return (0); 950 } 951 952 /* 953 * Determine whether a fragment can be extended. 954 * 955 * Check to see if the necessary fragments are available, and 956 * if they are, allocate them. 
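 * Extension is only possible when the extra fragments immediately
 * following the old piece are still free and the grown piece stays
 * within one block: e.g. with 8 frags per block, a 3-frag piece at
 * frag offset 2 can grow to 5 frags only if offsets 5 and 6 of that
 * block are free.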
957 * 958 * => called with um_lock held 959 * => returns with um_lock released on success, held on failure 960 */ 961 static daddr_t 962 ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize) 963 { 964 struct ufsmount *ump; 965 struct fs *fs; 966 struct cg *cgp; 967 struct buf *bp; 968 daddr_t bno; 969 int frags, bbase; 970 int i, error; 971 u_int8_t *blksfree; 972 973 fs = ip->i_fs; 974 ump = ip->i_ump; 975 976 KASSERT(mutex_owned(&ump->um_lock)); 977 978 if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize)) 979 return (0); 980 frags = ffs_numfrags(fs, nsize); 981 bbase = ffs_fragnum(fs, bprev); 982 if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) { 983 /* cannot extend across a block boundary */ 984 return (0); 985 } 986 mutex_exit(&ump->um_lock); 987 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 988 (int)fs->fs_cgsize, B_MODIFY, &bp); 989 if (error) 990 goto fail; 991 cgp = (struct cg *)bp->b_data; 992 if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) 993 goto fail; 994 cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs)); 995 if ((fs->fs_magic != FS_UFS1_MAGIC) || 996 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 997 cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs)); 998 bno = dtogd(fs, bprev); 999 blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)); 1000 for (i = ffs_numfrags(fs, osize); i < frags; i++) 1001 if (isclr(blksfree, bno + i)) 1002 goto fail; 1003 /* 1004 * the current fragment can be extended 1005 * deduct the count on fragment being extended into 1006 * increase the count on the remaining fragment (if any) 1007 * allocate the extended piece 1008 */ 1009 for (i = frags; i < fs->fs_frag - bbase; i++) 1010 if (isclr(blksfree, bno + i)) 1011 break; 1012 ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs)); 1013 if (i != frags) 1014 ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs)); 1015 mutex_enter(&ump->um_lock); 1016 for (i = ffs_numfrags(fs, osize); i < frags; i++) { 1017 clrbit(blksfree, bno + i); 1018 ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs)); 1019 fs->fs_cstotal.cs_nffree--; 1020 fs->fs_cs(fs, cg).cs_nffree--; 1021 } 1022 fs->fs_fmod = 1; 1023 ACTIVECG_CLR(fs, cg); 1024 mutex_exit(&ump->um_lock); 1025 bdwrite(bp); 1026 return (bprev); 1027 1028 fail: 1029 if (bp != NULL) 1030 brelse(bp, 0); 1031 mutex_enter(&ump->um_lock); 1032 return (0); 1033 } 1034 1035 /* 1036 * Determine whether a block can be allocated. 1037 * 1038 * Check to see if a block of the appropriate size is available, 1039 * and if it is, allocate it. 
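 * If a full block is requested but the caller actually needs less
 * (realsize), the surplus fragments are freed again immediately after
 * the block has been taken.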
1040 */ 1041 static daddr_t 1042 ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int realsize, 1043 int flags) 1044 { 1045 struct ufsmount *ump; 1046 struct fs *fs = ip->i_fs; 1047 struct cg *cgp; 1048 struct buf *bp; 1049 int32_t bno; 1050 daddr_t blkno; 1051 int error, frags, allocsiz, i; 1052 u_int8_t *blksfree; 1053 const int needswap = UFS_FSNEEDSWAP(fs); 1054 1055 ump = ip->i_ump; 1056 1057 KASSERT(mutex_owned(&ump->um_lock)); 1058 1059 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1060 return (0); 1061 mutex_exit(&ump->um_lock); 1062 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1063 (int)fs->fs_cgsize, B_MODIFY, &bp); 1064 if (error) 1065 goto fail; 1066 cgp = (struct cg *)bp->b_data; 1067 if (!cg_chkmagic(cgp, needswap) || 1068 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1069 goto fail; 1070 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1071 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1072 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1073 cgp->cg_time = ufs_rw64(time_second, needswap); 1074 if (size == fs->fs_bsize) { 1075 mutex_enter(&ump->um_lock); 1076 blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags); 1077 ACTIVECG_CLR(fs, cg); 1078 mutex_exit(&ump->um_lock); 1079 1080 /* 1081 * If actually needed size is lower, free the extra blocks now. 1082 * This is safe to call here, there is no outside reference 1083 * to this block yet. It is not necessary to keep um_lock 1084 * locked. 1085 */ 1086 if (realsize != 0 && realsize < size) { 1087 ffs_blkfree_common(ip->i_ump, ip->i_fs, 1088 ip->i_devvp->v_rdev, 1089 bp, blkno + ffs_numfrags(fs, realsize), 1090 (long)(size - realsize), false); 1091 } 1092 1093 bdwrite(bp); 1094 return (blkno); 1095 } 1096 /* 1097 * check to see if any fragments are already available 1098 * allocsiz is the size which will be allocated, hacking 1099 * it down to a smaller size if necessary 1100 */ 1101 blksfree = cg_blksfree(cgp, needswap); 1102 frags = ffs_numfrags(fs, size); 1103 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1104 if (cgp->cg_frsum[allocsiz] != 0) 1105 break; 1106 if (allocsiz == fs->fs_frag) { 1107 /* 1108 * no fragments were available, so a block will be 1109 * allocated, and hacked up 1110 */ 1111 if (cgp->cg_cs.cs_nbfree == 0) 1112 goto fail; 1113 mutex_enter(&ump->um_lock); 1114 blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags); 1115 bno = dtogd(fs, blkno); 1116 for (i = frags; i < fs->fs_frag; i++) 1117 setbit(blksfree, bno + i); 1118 i = fs->fs_frag - frags; 1119 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); 1120 fs->fs_cstotal.cs_nffree += i; 1121 fs->fs_cs(fs, cg).cs_nffree += i; 1122 fs->fs_fmod = 1; 1123 ufs_add32(cgp->cg_frsum[i], 1, needswap); 1124 ACTIVECG_CLR(fs, cg); 1125 mutex_exit(&ump->um_lock); 1126 bdwrite(bp); 1127 return (blkno); 1128 } 1129 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1130 #if 0 1131 /* 1132 * XXX fvdl mapsearch will panic, and never return -1 1133 * also: returning NULL as daddr_t ? 
1134 */ 1135 if (bno < 0) 1136 goto fail; 1137 #endif 1138 for (i = 0; i < frags; i++) 1139 clrbit(blksfree, bno + i); 1140 mutex_enter(&ump->um_lock); 1141 ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); 1142 fs->fs_cstotal.cs_nffree -= frags; 1143 fs->fs_cs(fs, cg).cs_nffree -= frags; 1144 fs->fs_fmod = 1; 1145 ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); 1146 if (frags != allocsiz) 1147 ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); 1148 blkno = cgbase(fs, cg) + bno; 1149 ACTIVECG_CLR(fs, cg); 1150 mutex_exit(&ump->um_lock); 1151 bdwrite(bp); 1152 return blkno; 1153 1154 fail: 1155 if (bp != NULL) 1156 brelse(bp, 0); 1157 mutex_enter(&ump->um_lock); 1158 return (0); 1159 } 1160 1161 /* 1162 * Allocate a block in a cylinder group. 1163 * 1164 * This algorithm implements the following policy: 1165 * 1) allocate the requested block. 1166 * 2) allocate a rotationally optimal block in the same cylinder. 1167 * 3) allocate the next available block on the block rotor for the 1168 * specified cylinder group. 1169 * Note that this routine only allocates fs_bsize blocks; these 1170 * blocks may be fragmented by the routine that allocates them. 1171 */ 1172 static daddr_t 1173 ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize, 1174 int flags) 1175 { 1176 struct fs *fs = ip->i_fs; 1177 struct cg *cgp; 1178 int cg; 1179 daddr_t blkno; 1180 int32_t bno; 1181 u_int8_t *blksfree; 1182 const int needswap = UFS_FSNEEDSWAP(fs); 1183 1184 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 1185 1186 cgp = (struct cg *)bp->b_data; 1187 blksfree = cg_blksfree(cgp, needswap); 1188 if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { 1189 bpref = ufs_rw32(cgp->cg_rotor, needswap); 1190 } else { 1191 bpref = ffs_blknum(fs, bpref); 1192 bno = dtogd(fs, bpref); 1193 /* 1194 * if the requested block is available, use it 1195 */ 1196 if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno))) 1197 goto gotit; 1198 /* 1199 * if the requested data block isn't available and we are 1200 * trying to allocate a contiguous file, return an error. 1201 */ 1202 if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) 1203 return (0); 1204 } 1205 1206 /* 1207 * Take the next available block in this cylinder group. 1208 */ 1209 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1210 #if 0 1211 /* 1212 * XXX jdolecek ffs_mapsearch() succeeds or panics 1213 */ 1214 if (bno < 0) 1215 return (0); 1216 #endif 1217 cgp->cg_rotor = ufs_rw32(bno, needswap); 1218 gotit: 1219 blkno = ffs_fragstoblks(fs, bno); 1220 ffs_clrblock(fs, blksfree, blkno); 1221 ffs_clusteracct(fs, cgp, blkno, -1); 1222 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1223 fs->fs_cstotal.cs_nbfree--; 1224 fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; 1225 if ((fs->fs_magic == FS_UFS1_MAGIC) && 1226 ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { 1227 int cylno; 1228 cylno = old_cbtocylno(fs, bno); 1229 KASSERT(cylno >= 0); 1230 KASSERT(cylno < fs->fs_old_ncyl); 1231 KASSERT(old_cbtorpos(fs, bno) >= 0); 1232 KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos); 1233 ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1, 1234 needswap); 1235 ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap); 1236 } 1237 fs->fs_fmod = 1; 1238 cg = ufs_rw32(cgp->cg_cgx, needswap); 1239 blkno = cgbase(fs, cg) + bno; 1240 return (blkno); 1241 } 1242 1243 /* 1244 * Determine whether an inode can be allocated. 
1245 * 1246 * Check to see if an inode is available, and if it is, 1247 * allocate it using the following policy: 1248 * 1) allocate the requested inode. 1249 * 2) allocate the next available inode after the requested 1250 * inode in the specified cylinder group. 1251 */ 1252 static daddr_t 1253 ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int realsize, 1254 int flags) 1255 { 1256 struct ufsmount *ump = ip->i_ump; 1257 struct fs *fs = ip->i_fs; 1258 struct cg *cgp; 1259 struct buf *bp, *ibp; 1260 u_int8_t *inosused; 1261 int error, start, len, loc, map, i; 1262 int32_t initediblk; 1263 daddr_t nalloc; 1264 struct ufs2_dinode *dp2; 1265 const int needswap = UFS_FSNEEDSWAP(fs); 1266 1267 KASSERT(mutex_owned(&ump->um_lock)); 1268 UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); 1269 1270 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1271 return (0); 1272 mutex_exit(&ump->um_lock); 1273 ibp = NULL; 1274 initediblk = -1; 1275 retry: 1276 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1277 (int)fs->fs_cgsize, B_MODIFY, &bp); 1278 if (error) 1279 goto fail; 1280 cgp = (struct cg *)bp->b_data; 1281 if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) 1282 goto fail; 1283 1284 if (ibp != NULL && 1285 initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { 1286 /* Another thread allocated more inodes so we retry the test. */ 1287 brelse(ibp, 0); 1288 ibp = NULL; 1289 } 1290 /* 1291 * Check to see if we need to initialize more inodes. 1292 */ 1293 if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) { 1294 initediblk = ufs_rw32(cgp->cg_initediblk, needswap); 1295 nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap); 1296 if (nalloc + FFS_INOPB(fs) > initediblk && 1297 initediblk < ufs_rw32(cgp->cg_niblk, needswap)) { 1298 /* 1299 * We have to release the cg buffer here to prevent 1300 * a deadlock when reading the inode block will 1301 * run a copy-on-write that might use this cg. 1302 */ 1303 brelse(bp, 0); 1304 bp = NULL; 1305 error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs, 1306 ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), 1307 FFS_NOBLK, fs->fs_bsize, false, &ibp); 1308 if (error) 1309 goto fail; 1310 goto retry; 1311 } 1312 } 1313 1314 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1315 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1316 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1317 cgp->cg_time = ufs_rw64(time_second, needswap); 1318 inosused = cg_inosused(cgp, needswap); 1319 if (ipref) { 1320 ipref %= fs->fs_ipg; 1321 if (isclr(inosused, ipref)) 1322 goto gotit; 1323 } 1324 start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY; 1325 len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap), 1326 NBBY); 1327 loc = skpc(0xff, len, &inosused[start]); 1328 if (loc == 0) { 1329 len = start + 1; 1330 start = 0; 1331 loc = skpc(0xff, len, &inosused[0]); 1332 if (loc == 0) { 1333 panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s", 1334 __func__, cg, ufs_rw32(cgp->cg_irotor, needswap), 1335 fs->fs_fsmnt); 1336 /* NOTREACHED */ 1337 } 1338 } 1339 i = start + len - loc; 1340 map = inosused[i] ^ 0xff; 1341 if (map == 0) { 1342 panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt); 1343 } 1344 ipref = i * NBBY + ffs(map) - 1; 1345 cgp->cg_irotor = ufs_rw32(ipref, needswap); 1346 gotit: 1347 UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, 1348 mode); 1349 /* 1350 * Check to see if we need to initialize more inodes. 
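 * (If a fresh inode block was obtained above, it is zeroed and each
 * dinode is seeded with a random generation number before the new
 * inodes become visible.)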
1351 */ 1352 if (ibp != NULL) { 1353 KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap)); 1354 memset(ibp->b_data, 0, fs->fs_bsize); 1355 dp2 = (struct ufs2_dinode *)(ibp->b_data); 1356 for (i = 0; i < FFS_INOPB(fs); i++) { 1357 /* 1358 * Don't bother to swap, it's supposed to be 1359 * random, after all. 1360 */ 1361 dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1; 1362 dp2++; 1363 } 1364 initediblk += FFS_INOPB(fs); 1365 cgp->cg_initediblk = ufs_rw32(initediblk, needswap); 1366 } 1367 1368 mutex_enter(&ump->um_lock); 1369 ACTIVECG_CLR(fs, cg); 1370 setbit(inosused, ipref); 1371 ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap); 1372 fs->fs_cstotal.cs_nifree--; 1373 fs->fs_cs(fs, cg).cs_nifree--; 1374 fs->fs_fmod = 1; 1375 if ((mode & IFMT) == IFDIR) { 1376 ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap); 1377 fs->fs_cstotal.cs_ndir++; 1378 fs->fs_cs(fs, cg).cs_ndir++; 1379 } 1380 mutex_exit(&ump->um_lock); 1381 if (ibp != NULL) { 1382 bwrite(ibp); 1383 bwrite(bp); 1384 } else 1385 bdwrite(bp); 1386 return (cg * fs->fs_ipg + ipref); 1387 fail: 1388 if (bp != NULL) 1389 brelse(bp, 0); 1390 if (ibp != NULL) 1391 brelse(ibp, 0); 1392 mutex_enter(&ump->um_lock); 1393 return (0); 1394 } 1395 1396 /* 1397 * Allocate a block or fragment. 1398 * 1399 * The specified block or fragment is removed from the 1400 * free map, possibly fragmenting a block in the process. 1401 * 1402 * This implementation should mirror fs_blkfree 1403 * 1404 * => um_lock not held on entry or exit 1405 */ 1406 int 1407 ffs_blkalloc(struct inode *ip, daddr_t bno, long size) 1408 { 1409 int error; 1410 1411 error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size, 1412 ip->i_dev, ip->i_uid); 1413 if (error) 1414 return error; 1415 1416 return ffs_blkalloc_ump(ip->i_ump, bno, size); 1417 } 1418 1419 int 1420 ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size) 1421 { 1422 struct fs *fs = ump->um_fs; 1423 struct cg *cgp; 1424 struct buf *bp; 1425 int32_t fragno, cgbno; 1426 int i, error, cg, blk, frags, bbase; 1427 u_int8_t *blksfree; 1428 const int needswap = UFS_FSNEEDSWAP(fs); 1429 1430 KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 && 1431 ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag); 1432 KASSERT(bno < fs->fs_size); 1433 1434 cg = dtog(fs, bno); 1435 error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1436 (int)fs->fs_cgsize, B_MODIFY, &bp); 1437 if (error) { 1438 return error; 1439 } 1440 cgp = (struct cg *)bp->b_data; 1441 if (!cg_chkmagic(cgp, needswap)) { 1442 brelse(bp, 0); 1443 return EIO; 1444 } 1445 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1446 cgp->cg_time = ufs_rw64(time_second, needswap); 1447 cgbno = dtogd(fs, bno); 1448 blksfree = cg_blksfree(cgp, needswap); 1449 1450 mutex_enter(&ump->um_lock); 1451 if (size == fs->fs_bsize) { 1452 fragno = ffs_fragstoblks(fs, cgbno); 1453 if (!ffs_isblock(fs, blksfree, fragno)) { 1454 mutex_exit(&ump->um_lock); 1455 brelse(bp, 0); 1456 return EBUSY; 1457 } 1458 ffs_clrblock(fs, blksfree, fragno); 1459 ffs_clusteracct(fs, cgp, fragno, -1); 1460 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1461 fs->fs_cstotal.cs_nbfree--; 1462 fs->fs_cs(fs, cg).cs_nbfree--; 1463 } else { 1464 bbase = cgbno - ffs_fragnum(fs, cgbno); 1465 1466 frags = ffs_numfrags(fs, size); 1467 for (i = 0; i < frags; i++) { 1468 if (isclr(blksfree, cgbno + i)) { 1469 mutex_exit(&ump->um_lock); 1470 brelse(bp, 0); 1471 return EBUSY; 1472 } 1473 } 1474 /* 1475 * if a complete block is being split, account for it 1476 */ 1477 fragno = 
ffs_fragstoblks(fs, bbase); 1478 if (ffs_isblock(fs, blksfree, fragno)) { 1479 ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); 1480 fs->fs_cstotal.cs_nffree += fs->fs_frag; 1481 fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; 1482 ffs_clusteracct(fs, cgp, fragno, -1); 1483 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1484 fs->fs_cstotal.cs_nbfree--; 1485 fs->fs_cs(fs, cg).cs_nbfree--; 1486 } 1487 /* 1488 * decrement the counts associated with the old frags 1489 */ 1490 blk = blkmap(fs, blksfree, bbase); 1491 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); 1492 /* 1493 * allocate the fragment 1494 */ 1495 for (i = 0; i < frags; i++) { 1496 clrbit(blksfree, cgbno + i); 1497 } 1498 ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); 1499 fs->fs_cstotal.cs_nffree -= i; 1500 fs->fs_cs(fs, cg).cs_nffree -= i; 1501 /* 1502 * add back in counts associated with the new frags 1503 */ 1504 blk = blkmap(fs, blksfree, bbase); 1505 ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); 1506 } 1507 fs->fs_fmod = 1; 1508 ACTIVECG_CLR(fs, cg); 1509 mutex_exit(&ump->um_lock); 1510 bdwrite(bp); 1511 return 0; 1512 } 1513 1514 /* 1515 * Free a block or fragment. 1516 * 1517 * The specified block or fragment is placed back in the 1518 * free map. If a fragment is deallocated, a possible 1519 * block reassembly is checked. 1520 * 1521 * => um_lock not held on entry or exit 1522 */ 1523 static void 1524 ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size) 1525 { 1526 struct cg *cgp; 1527 struct buf *bp; 1528 struct ufsmount *ump; 1529 daddr_t cgblkno; 1530 int error, cg; 1531 dev_t dev; 1532 const bool devvp_is_snapshot = (devvp->v_type != VBLK); 1533 const int needswap = UFS_FSNEEDSWAP(fs); 1534 1535 KASSERT(!devvp_is_snapshot); 1536 1537 cg = dtog(fs, bno); 1538 dev = devvp->v_rdev; 1539 ump = VFSTOUFS(spec_node_getmountedfs(devvp)); 1540 KASSERT(fs == ump->um_fs); 1541 cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg)); 1542 1543 error = bread(devvp, cgblkno, (int)fs->fs_cgsize, 1544 B_MODIFY, &bp); 1545 if (error) { 1546 return; 1547 } 1548 cgp = (struct cg *)bp->b_data; 1549 if (!cg_chkmagic(cgp, needswap)) { 1550 brelse(bp, 0); 1551 return; 1552 } 1553 1554 ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); 1555 1556 bdwrite(bp); 1557 } 1558 1559 struct discardopdata { 1560 struct work wk; /* must be first */ 1561 struct vnode *devvp; 1562 daddr_t bno; 1563 long size; 1564 }; 1565 1566 struct discarddata { 1567 struct fs *fs; 1568 struct discardopdata *entry; 1569 long maxsize; 1570 kmutex_t entrylk; 1571 struct workqueue *wq; 1572 int wqcnt, wqdraining; 1573 kmutex_t wqlk; 1574 kcondvar_t wqcv; 1575 /* timer for flush? 
*/ 1576 }; 1577 1578 static void 1579 ffs_blkfree_td(struct fs *fs, struct discardopdata *td) 1580 { 1581 struct mount *mp = spec_node_getmountedfs(td->devvp); 1582 long todo; 1583 int error; 1584 1585 while (td->size) { 1586 todo = min(td->size, 1587 ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno)))); 1588 error = UFS_WAPBL_BEGIN(mp); 1589 if (error) { 1590 printf("ffs: failed to begin wapbl transaction" 1591 " for discard: %d\n", error); 1592 break; 1593 } 1594 ffs_blkfree_cg(fs, td->devvp, td->bno, todo); 1595 UFS_WAPBL_END(mp); 1596 td->bno += ffs_numfrags(fs, todo); 1597 td->size -= todo; 1598 } 1599 } 1600 1601 static void 1602 ffs_discardcb(struct work *wk, void *arg) 1603 { 1604 struct discardopdata *td = (void *)wk; 1605 struct discarddata *ts = arg; 1606 struct fs *fs = ts->fs; 1607 off_t start, len; 1608 #ifdef TRIMDEBUG 1609 int error; 1610 #endif 1611 1612 /* like FSBTODB but emits bytes; XXX move to fs.h */ 1613 #ifndef FFS_FSBTOBYTES 1614 #define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift) 1615 #endif 1616 1617 start = FFS_FSBTOBYTES(fs, td->bno); 1618 len = td->size; 1619 #ifdef TRIMDEBUG 1620 error = 1621 #endif 1622 VOP_FDISCARD(td->devvp, start, len); 1623 #ifdef TRIMDEBUG 1624 printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error); 1625 #endif 1626 1627 ffs_blkfree_td(fs, td); 1628 kmem_free(td, sizeof(*td)); 1629 mutex_enter(&ts->wqlk); 1630 ts->wqcnt--; 1631 if (ts->wqdraining && !ts->wqcnt) 1632 cv_signal(&ts->wqcv); 1633 mutex_exit(&ts->wqlk); 1634 } 1635 1636 void * 1637 ffs_discard_init(struct vnode *devvp, struct fs *fs) 1638 { 1639 struct discarddata *ts; 1640 int error; 1641 1642 ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); 1643 error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts, 1644 0, 0, 0); 1645 if (error) { 1646 kmem_free(ts, sizeof (*ts)); 1647 return NULL; 1648 } 1649 mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE); 1650 mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE); 1651 cv_init(&ts->wqcv, "trimwqcv"); 1652 ts->maxsize = 100*1024; /* XXX */ 1653 ts->fs = fs; 1654 return ts; 1655 } 1656 1657 void 1658 ffs_discard_finish(void *vts, int flags) 1659 { 1660 struct discarddata *ts = vts; 1661 struct discardopdata *td = NULL; 1662 1663 /* wait for workqueue to drain */ 1664 mutex_enter(&ts->wqlk); 1665 if (ts->wqcnt) { 1666 ts->wqdraining = 1; 1667 cv_wait(&ts->wqcv, &ts->wqlk); 1668 } 1669 mutex_exit(&ts->wqlk); 1670 1671 mutex_enter(&ts->entrylk); 1672 if (ts->entry) { 1673 td = ts->entry; 1674 ts->entry = NULL; 1675 } 1676 mutex_exit(&ts->entrylk); 1677 if (td) { 1678 /* XXX don't tell disk, its optional */ 1679 ffs_blkfree_td(ts->fs, td); 1680 #ifdef TRIMDEBUG 1681 printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size); 1682 #endif 1683 kmem_free(td, sizeof(*td)); 1684 } 1685 1686 cv_destroy(&ts->wqcv); 1687 mutex_destroy(&ts->entrylk); 1688 mutex_destroy(&ts->wqlk); 1689 workqueue_destroy(ts->wq); 1690 kmem_free(ts, sizeof(*ts)); 1691 } 1692 1693 void 1694 ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, 1695 ino_t inum) 1696 { 1697 struct ufsmount *ump; 1698 int error; 1699 dev_t dev; 1700 struct discarddata *ts; 1701 struct discardopdata *td; 1702 1703 dev = devvp->v_rdev; 1704 ump = VFSTOUFS(spec_node_getmountedfs(devvp)); 1705 if (ffs_snapblkfree(fs, devvp, bno, size, inum)) 1706 return; 1707 1708 error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); 1709 if (error) 1710 return; 1711 1712 if (!ump->um_discarddata) { 1713 ffs_blkfree_cg(fs, devvp, bno, size); 1714 return; 1715 } 
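/*
 * Discard (TRIM) is enabled: try to coalesce this range with the
 * pending entry (ffs deallocates backwards, so only a prepend is
 * attempted).  Pending entries are flushed to the trim workqueue when
 * they reach maxsize or when a non-adjacent range arrives; the
 * workqueue callback issues the discard and then frees the blocks in
 * the cg maps.
 */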
1716 1717 #ifdef TRIMDEBUG 1718 printf("blkfree(%" PRId64 ",%ld)\n", bno, size); 1719 #endif 1720 ts = ump->um_discarddata; 1721 td = NULL; 1722 1723 mutex_enter(&ts->entrylk); 1724 if (ts->entry) { 1725 td = ts->entry; 1726 /* ffs deallocs backwards, check for prepend only */ 1727 if (td->bno == bno + ffs_numfrags(fs, size) 1728 && td->size + size <= ts->maxsize) { 1729 td->bno = bno; 1730 td->size += size; 1731 if (td->size < ts->maxsize) { 1732 #ifdef TRIMDEBUG 1733 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); 1734 #endif 1735 mutex_exit(&ts->entrylk); 1736 return; 1737 } 1738 size = 0; /* mark done */ 1739 } 1740 ts->entry = NULL; 1741 } 1742 mutex_exit(&ts->entrylk); 1743 1744 if (td) { 1745 #ifdef TRIMDEBUG 1746 printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size); 1747 #endif 1748 mutex_enter(&ts->wqlk); 1749 ts->wqcnt++; 1750 mutex_exit(&ts->wqlk); 1751 workqueue_enqueue(ts->wq, &td->wk, NULL); 1752 } 1753 if (!size) 1754 return; 1755 1756 td = kmem_alloc(sizeof(*td), KM_SLEEP); 1757 td->devvp = devvp; 1758 td->bno = bno; 1759 td->size = size; 1760 1761 if (td->size < ts->maxsize) { /* XXX always the case */ 1762 mutex_enter(&ts->entrylk); 1763 if (!ts->entry) { /* possible race? */ 1764 #ifdef TRIMDEBUG 1765 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); 1766 #endif 1767 ts->entry = td; 1768 td = NULL; 1769 } 1770 mutex_exit(&ts->entrylk); 1771 } 1772 if (td) { 1773 #ifdef TRIMDEBUG 1774 printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size); 1775 #endif 1776 mutex_enter(&ts->wqlk); 1777 ts->wqcnt++; 1778 mutex_exit(&ts->wqlk); 1779 workqueue_enqueue(ts->wq, &td->wk, NULL); 1780 } 1781 } 1782 1783 /* 1784 * Free a block or fragment from a snapshot cg copy. 1785 * 1786 * The specified block or fragment is placed back in the 1787 * free map. If a fragment is deallocated, a possible 1788 * block reassembly is checked. 
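 * Unlike ffs_blkfree(), this operates on a snapshot vnode's copy of
 * the cylinder group, so the cg block is read by its logical block
 * number within the snapshot rather than by device block number.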
1789 * 1790 * => um_lock not held on entry or exit 1791 */ 1792 void 1793 ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, 1794 ino_t inum) 1795 { 1796 struct cg *cgp; 1797 struct buf *bp; 1798 struct ufsmount *ump; 1799 daddr_t cgblkno; 1800 int error, cg; 1801 dev_t dev; 1802 const bool devvp_is_snapshot = (devvp->v_type != VBLK); 1803 const int needswap = UFS_FSNEEDSWAP(fs); 1804 1805 KASSERT(devvp_is_snapshot); 1806 1807 cg = dtog(fs, bno); 1808 dev = VTOI(devvp)->i_devvp->v_rdev; 1809 ump = VFSTOUFS(devvp->v_mount); 1810 cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg)); 1811 1812 error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); 1813 if (error) 1814 return; 1815 1816 error = bread(devvp, cgblkno, (int)fs->fs_cgsize, 1817 B_MODIFY, &bp); 1818 if (error) { 1819 return; 1820 } 1821 cgp = (struct cg *)bp->b_data; 1822 if (!cg_chkmagic(cgp, needswap)) { 1823 brelse(bp, 0); 1824 return; 1825 } 1826 1827 ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); 1828 1829 bdwrite(bp); 1830 } 1831 1832 static void 1833 ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev, 1834 struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot) 1835 { 1836 struct cg *cgp; 1837 int32_t fragno, cgbno; 1838 int i, cg, blk, frags, bbase; 1839 u_int8_t *blksfree; 1840 const int needswap = UFS_FSNEEDSWAP(fs); 1841 1842 cg = dtog(fs, bno); 1843 cgp = (struct cg *)bp->b_data; 1844 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1845 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1846 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1847 cgp->cg_time = ufs_rw64(time_second, needswap); 1848 cgbno = dtogd(fs, bno); 1849 blksfree = cg_blksfree(cgp, needswap); 1850 mutex_enter(&ump->um_lock); 1851 if (size == fs->fs_bsize) { 1852 fragno = ffs_fragstoblks(fs, cgbno); 1853 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 1854 if (devvp_is_snapshot) { 1855 mutex_exit(&ump->um_lock); 1856 return; 1857 } 1858 panic("%s: freeing free block: dev = 0x%llx, block = %" 1859 PRId64 ", fs = %s", __func__, 1860 (unsigned long long)dev, bno, fs->fs_fsmnt); 1861 } 1862 ffs_setblock(fs, blksfree, fragno); 1863 ffs_clusteracct(fs, cgp, fragno, 1); 1864 ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); 1865 fs->fs_cstotal.cs_nbfree++; 1866 fs->fs_cs(fs, cg).cs_nbfree++; 1867 if ((fs->fs_magic == FS_UFS1_MAGIC) && 1868 ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { 1869 i = old_cbtocylno(fs, cgbno); 1870 KASSERT(i >= 0); 1871 KASSERT(i < fs->fs_old_ncyl); 1872 KASSERT(old_cbtorpos(fs, cgbno) >= 0); 1873 KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos); 1874 ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1, 1875 needswap); 1876 ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); 1877 } 1878 } else { 1879 bbase = cgbno - ffs_fragnum(fs, cgbno); 1880 /* 1881 * decrement the counts associated with the old frags 1882 */ 1883 blk = blkmap(fs, blksfree, bbase); 1884 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); 1885 /* 1886 * deallocate the fragment 1887 */ 1888 frags = ffs_numfrags(fs, size); 1889 for (i = 0; i < frags; i++) { 1890 if (isset(blksfree, cgbno + i)) { 1891 panic("%s: freeing free frag: " 1892 "dev = 0x%llx, block = %" PRId64 1893 ", fs = %s", __func__, 1894 (unsigned long long)dev, bno + i, 1895 fs->fs_fsmnt); 1896 } 1897 setbit(blksfree, cgbno + i); 1898 } 1899 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); 1900 fs->fs_cstotal.cs_nffree += i; 1901 fs->fs_cs(fs, cg).cs_nffree += i; 1902 /* 1903 * add back in counts 
		bbase = cgbno - ffs_fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
		/*
		 * deallocate the fragment
		 */
		frags = ffs_numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				panic("%s: freeing free frag: "
				    "dev = 0x%llx, block = %" PRId64
				    ", fs = %s", __func__,
				    (unsigned long long)dev, bno + i,
				    fs->fs_fsmnt);
			}
			setbit(blksfree, cgbno + i);
		}
		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = ffs_fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
				i = old_cbtocylno(fs, bbase);
				KASSERT(i >= 0);
				KASSERT(i < fs->fs_old_ncyl);
				KASSERT(old_cbtorpos(fs, bbase) >= 0);
				KASSERT(fs->fs_old_nrpos == 0 ||
				    old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
				ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
				    bbase)], 1, needswap);
				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
			}
		}
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Free an inode.
 */
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{

	return ffs_freefile(vp->v_mount, ino, mode);
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 *
 * => um_lock not held on entry or exit
 */
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct vnode *devvp;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	devvp = ump->um_devvp;
	dev = devvp->v_rdev;
	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));

	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
		    (long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}

	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);

	bdwrite(bp);

	return 0;
}

int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp->v_type != VBLK);

	cg = ino_to_cg(fs, ino);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}
	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);

	bdwrite(bp);

	return 0;
}
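
/*
 * ffs_freefile() and ffs_freefile_snap() above both defer to
 * ffs_freefile_common(); they differ only in how the cylinder group
 * block is located and read.  As an illustration, for cylinder group cg:
 *
 *	via the device vnode:	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
 *	via a snapshot vnode:	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
 *
 * since a snapshot file is addressed in file-system blocks rather than
 * in device (DEV_BSIZE) sectors.
 */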
static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
	int cg;
	struct cg *cgp;
	u_int8_t *inosused;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	inosused = cg_inosused(cgp, needswap);
	ino %= fs->fs_ipg;
	if (isclr(inosused, ino)) {
		printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
		    (unsigned long long)dev, (unsigned long long)ino +
		    cg * fs->fs_ipg, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("%s: freeing free inode", __func__);
	}
	clrbit(inosused, ino);
	if (!devvp_is_snapshot)
		UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
		    ino + cg * fs->fs_ipg, mode);
	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
		cgp->cg_irotor = ufs_rw32(ino, needswap);
	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
	mutex_enter(&ump->um_lock);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Check to see if a file is free.
 * Returns non-zero if the inode is not marked allocated in the cylinder
 * group map (or if the map cannot be read), zero if it is in use.
 */
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
	struct cg *cgp;
	struct buf *bp;
	daddr_t cgbno;
	int ret, cg;
	u_int8_t *inosused;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);

	KASSERT(devvp_is_snapshot);

	cg = ino_to_cg(fs, ino);
	if (devvp_is_snapshot)
		cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	else
		cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		return 1;
	if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
		return 1;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
		brelse(bp, 0);
		return 1;
	}
	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp, 0);
	return ret;
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block when none are
 * available.
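 *
 * In outline, the search starts at cg_frotor (or at the preferred block,
 * if one was given) and scans the free map a byte at a time with scanc()
 * and fragtbl[fs->fs_frag], which maps a byte of the map to the set of
 * free-fragment run sizes it contains; if nothing is found before the end
 * of the map, the scan wraps around to the beginning.  The matching byte
 * is then sifted bit by bit with the around[]/inside[] masks to locate
 * the first run of the requested size.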
 */
static int32_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
	int32_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	int ostart, olen;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	/* KASSERT(mutex_owned(&ump->um_lock)); */

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
	blksfree = cg_blksfree(cgp, needswap);
	len = howmany(fs->fs_fpg, NBBY) - start;
	ostart = start;
	olen = len;
	loc = scanc((u_int)len,
	    (const u_char *)&blksfree[start],
	    (const u_char *)fragtbl[fs->fs_frag],
	    (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = scanc((u_int)len,
		    (const u_char *)&blksfree[0],
		    (const u_char *)fragtbl[fs->fs_frag],
		    (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
		if (loc == 0) {
			panic("%s: map corrupted: start=%d, len=%d, "
			    "fs = %s, offset=%d/%ld, cg %d", __func__,
			    ostart, olen, fs->fs_fsmnt,
			    ufs_rw32(cgp->cg_freeoff, needswap),
			    (long)blksfree - (long)cgp, cgp->cg_cgx);
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = ufs_rw32(bno, needswap);
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	panic("%s: block not in map: bno=%d, fs=%s", __func__,
	    bno, fs->fs_fsmnt);
	/* return (-1); */
}

/*
 * Fserr prints the name of a file system with an error diagnostic.
 *
 * The form of the error message is:
 *	uid <uid>, pid <pid>, command <command>, on <fs>: <error message>
 * with the uid omitted when no real credential is available.
 */
static void
ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
{
	KASSERT(cred != NULL);

	if (cred == NOCRED || cred == FSCRED) {
		log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
		    curproc->p_pid, curproc->p_comm,
		    fs->fs_fsmnt, cp);
	} else {
		log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
		    kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
		    fs->fs_fsmnt, cp);
	}
}