1 /* $NetBSD: ffs_alloc.c,v 1.145 2013/11/12 03:29:22 dholland Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2002 Networks Associates Technology, Inc. 34 * All rights reserved. 35 * 36 * This software was developed for the FreeBSD Project by Marshall 37 * Kirk McKusick and Network Associates Laboratories, the Security 38 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 39 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 40 * research program 41 * 42 * Copyright (c) 1982, 1986, 1989, 1993 43 * The Regents of the University of California. All rights reserved. 44 * 45 * Redistribution and use in source and binary forms, with or without 46 * modification, are permitted provided that the following conditions 47 * are met: 48 * 1. Redistributions of source code must retain the above copyright 49 * notice, this list of conditions and the following disclaimer. 50 * 2. Redistributions in binary form must reproduce the above copyright 51 * notice, this list of conditions and the following disclaimer in the 52 * documentation and/or other materials provided with the distribution. 53 * 3. Neither the name of the University nor the names of its contributors 54 * may be used to endorse or promote products derived from this software 55 * without specific prior written permission. 56 * 57 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 59 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 60 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 61 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 63 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 64 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 65 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 66 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 67 * SUCH DAMAGE. 68 * 69 * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 70 */ 71 72 #include <sys/cdefs.h> 73 __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.145 2013/11/12 03:29:22 dholland Exp $"); 74 75 #if defined(_KERNEL_OPT) 76 #include "opt_ffs.h" 77 #include "opt_quota.h" 78 #include "opt_uvm_page_trkown.h" 79 #endif 80 81 #include <sys/param.h> 82 #include <sys/systm.h> 83 #include <sys/buf.h> 84 #include <sys/cprng.h> 85 #include <sys/fstrans.h> 86 #include <sys/kauth.h> 87 #include <sys/kernel.h> 88 #include <sys/mount.h> 89 #include <sys/proc.h> 90 #include <sys/syslog.h> 91 #include <sys/vnode.h> 92 #include <sys/wapbl.h> 93 94 #include <miscfs/specfs/specdev.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/ufsmount.h> 97 #include <ufs/ufs/inode.h> 98 #include <ufs/ufs/ufs_extern.h> 99 #include <ufs/ufs/ufs_bswap.h> 100 #include <ufs/ufs/ufs_wapbl.h> 101 102 #include <ufs/ffs/fs.h> 103 #include <ufs/ffs/ffs_extern.h> 104 105 #ifdef UVM_PAGE_TRKOWN 106 #include <uvm/uvm.h> 107 #endif 108 109 static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int); 110 static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int); 111 static ino_t ffs_dirpref(struct inode *); 112 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); 113 static void ffs_fserr(struct fs *, u_int, const char *); 114 static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, 115 daddr_t (*)(struct inode *, int, daddr_t, int, int)); 116 static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int); 117 static int32_t ffs_mapsearch(struct fs *, struct cg *, 118 daddr_t, int); 119 static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *, 120 daddr_t, long, bool); 121 static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t, 122 int, bool); 123 124 /* if 1, changes in optimalization strategy are logged */ 125 int ffs_log_changeopt = 0; 126 127 /* in ffs_tables.c */ 128 extern const int inside[], around[]; 129 extern const u_char * const fragtbl[]; 130 131 /* Basic consistency check for block allocations */ 132 static int 133 ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno, 134 long size, dev_t dev, ino_t inum) 135 { 136 if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 || 137 ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) { 138 printf("dev = 0x%llx, bno = %" PRId64 " bsize = %d, " 139 "size = %ld, fs = %s\n", 140 (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); 141 panic("%s: bad size", func); 142 } 143 144 if (bno >= fs->fs_size) { 145 printf("bad block %" PRId64 ", ino %llu\n", bno, 146 (unsigned long long)inum); 147 ffs_fserr(fs, inum, "bad block"); 148 return EINVAL; 149 } 150 return 0; 151 } 152 153 /* 154 * Allocate a block in the file system. 155 * 156 * The size of the requested block is given, which must be some 157 * multiple of fs_fsize and <= fs_bsize. 158 * A preference may be optionally specified. 
If a preference is given 159 * the following hierarchy is used to allocate a block: 160 * 1) allocate the requested block. 161 * 2) allocate a rotationally optimal block in the same cylinder. 162 * 3) allocate a block in the same cylinder group. 163 * 4) quadradically rehash into other cylinder groups, until an 164 * available block is located. 165 * If no block preference is given the following hierarchy is used 166 * to allocate a block: 167 * 1) allocate a block in the cylinder group that contains the 168 * inode for the file. 169 * 2) quadradically rehash into other cylinder groups, until an 170 * available block is located. 171 * 172 * => called with um_lock held 173 * => releases um_lock before returning 174 */ 175 int 176 ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags, 177 kauth_cred_t cred, daddr_t *bnp) 178 { 179 struct ufsmount *ump; 180 struct fs *fs; 181 daddr_t bno; 182 int cg; 183 #if defined(QUOTA) || defined(QUOTA2) 184 int error; 185 #endif 186 187 fs = ip->i_fs; 188 ump = ip->i_ump; 189 190 KASSERT(mutex_owned(&ump->um_lock)); 191 192 #ifdef UVM_PAGE_TRKOWN 193 194 /* 195 * Sanity-check that allocations within the file size 196 * do not allow other threads to read the stale contents 197 * of newly allocated blocks. 198 * Usually pages will exist to cover the new allocation. 199 * There is an optimization in ffs_write() where we skip 200 * creating pages if several conditions are met: 201 * - the file must not be mapped (in any user address space). 202 * - the write must cover whole pages and whole blocks. 203 * If those conditions are not met then pages must exist and 204 * be locked by the current thread. 205 */ 206 207 if (ITOV(ip)->v_type == VREG && 208 ffs_lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) { 209 struct vm_page *pg; 210 struct vnode *vp = ITOV(ip); 211 struct uvm_object *uobj = &vp->v_uobj; 212 voff_t off = trunc_page(ffs_lblktosize(fs, lbn)); 213 voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size); 214 215 mutex_enter(uobj->vmobjlock); 216 while (off < endoff) { 217 pg = uvm_pagelookup(uobj, off); 218 KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == 0 && 219 (size & PAGE_MASK) == 0 && 220 ffs_blkoff(fs, size) == 0) || 221 (pg != NULL && pg->owner == curproc->p_pid && 222 pg->lowner == curlwp->l_lid)); 223 off += PAGE_SIZE; 224 } 225 mutex_exit(uobj->vmobjlock); 226 } 227 #endif 228 229 *bnp = 0; 230 #ifdef DIAGNOSTIC 231 if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0) { 232 printf("dev = 0x%llx, bsize = %d, size = %d, fs = %s\n", 233 (unsigned long long)ip->i_dev, fs->fs_bsize, size, 234 fs->fs_fsmnt); 235 panic("ffs_alloc: bad size"); 236 } 237 if (cred == NOCRED) 238 panic("ffs_alloc: missing credential"); 239 #endif /* DIAGNOSTIC */ 240 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 241 goto nospace; 242 if (freespace(fs, fs->fs_minfree) <= 0 && 243 kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, 244 NULL, NULL) != 0) 245 goto nospace; 246 #if defined(QUOTA) || defined(QUOTA2) 247 mutex_exit(&ump->um_lock); 248 if ((error = chkdq(ip, btodb(size), cred, 0)) != 0) 249 return (error); 250 mutex_enter(&ump->um_lock); 251 #endif 252 253 if (bpref >= fs->fs_size) 254 bpref = 0; 255 if (bpref == 0) 256 cg = ino_to_cg(fs, ip->i_number); 257 else 258 cg = dtog(fs, bpref); 259 bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg); 260 if (bno > 0) { 261 DIP_ADD(ip, blocks, btodb(size)); 262 ip->i_flag |= IN_CHANGE | IN_UPDATE; 263 *bnp = bno; 264 return (0); 
265 } 266 #if defined(QUOTA) || defined(QUOTA2) 267 /* 268 * Restore user's disk quota because allocation failed. 269 */ 270 (void) chkdq(ip, -btodb(size), cred, FORCE); 271 #endif 272 if (flags & B_CONTIG) { 273 /* 274 * XXX ump->um_lock handling is "suspect" at best. 275 * For the case where ffs_hashalloc() fails early 276 * in the B_CONTIG case we reach here with um_lock 277 * already unlocked, so we can't release it again 278 * like in the normal error path. See kern/39206. 279 * 280 * 281 * Fail silently - it's up to our caller to report 282 * errors. 283 */ 284 return (ENOSPC); 285 } 286 nospace: 287 mutex_exit(&ump->um_lock); 288 ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); 289 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); 290 return (ENOSPC); 291 } 292 293 /* 294 * Reallocate a fragment to a bigger size 295 * 296 * The number and size of the old block is given, and a preference 297 * and new size is also specified. The allocator attempts to extend 298 * the original block. Failing that, the regular block allocator is 299 * invoked to get an appropriate block. 300 * 301 * => called with um_lock held 302 * => return with um_lock released 303 */ 304 int 305 ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize, 306 int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop) 307 { 308 struct ufsmount *ump; 309 struct fs *fs; 310 struct buf *bp; 311 int cg, request, error; 312 daddr_t bprev, bno; 313 314 fs = ip->i_fs; 315 ump = ip->i_ump; 316 317 KASSERT(mutex_owned(&ump->um_lock)); 318 319 #ifdef UVM_PAGE_TRKOWN 320 321 /* 322 * Sanity-check that allocations within the file size 323 * do not allow other threads to read the stale contents 324 * of newly allocated blocks. 325 * Unlike in ffs_alloc(), here pages must always exist 326 * for such allocations, because only the last block of a file 327 * can be a fragment and ffs_write() will reallocate the 328 * fragment to the new size using ufs_balloc_range(), 329 * which always creates pages to cover blocks it allocates. 
330 */ 331 332 if (ITOV(ip)->v_type == VREG) { 333 struct vm_page *pg; 334 struct uvm_object *uobj = &ITOV(ip)->v_uobj; 335 voff_t off = trunc_page(ffs_lblktosize(fs, lbprev)); 336 voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize); 337 338 mutex_enter(uobj->vmobjlock); 339 while (off < endoff) { 340 pg = uvm_pagelookup(uobj, off); 341 KASSERT(pg->owner == curproc->p_pid && 342 pg->lowner == curlwp->l_lid); 343 off += PAGE_SIZE; 344 } 345 mutex_exit(uobj->vmobjlock); 346 } 347 #endif 348 349 #ifdef DIAGNOSTIC 350 if ((u_int)osize > fs->fs_bsize || ffs_fragoff(fs, osize) != 0 || 351 (u_int)nsize > fs->fs_bsize || ffs_fragoff(fs, nsize) != 0) { 352 printf( 353 "dev = 0x%llx, bsize = %d, osize = %d, nsize = %d, fs = %s\n", 354 (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, 355 fs->fs_fsmnt); 356 panic("ffs_realloccg: bad size"); 357 } 358 if (cred == NOCRED) 359 panic("ffs_realloccg: missing credential"); 360 #endif /* DIAGNOSTIC */ 361 if (freespace(fs, fs->fs_minfree) <= 0 && 362 kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, 363 NULL, NULL) != 0) { 364 mutex_exit(&ump->um_lock); 365 goto nospace; 366 } 367 if (fs->fs_magic == FS_UFS2_MAGIC) 368 bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs)); 369 else 370 bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs)); 371 372 if (bprev == 0) { 373 printf("dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s\n", 374 (unsigned long long)ip->i_dev, fs->fs_bsize, bprev, 375 fs->fs_fsmnt); 376 panic("ffs_realloccg: bad bprev"); 377 } 378 mutex_exit(&ump->um_lock); 379 380 /* 381 * Allocate the extra space in the buffer. 382 */ 383 if (bpp != NULL && 384 (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) { 385 return (error); 386 } 387 #if defined(QUOTA) || defined(QUOTA2) 388 if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) { 389 if (bpp != NULL) { 390 brelse(bp, 0); 391 } 392 return (error); 393 } 394 #endif 395 /* 396 * Check for extension in the existing location. 397 */ 398 cg = dtog(fs, bprev); 399 mutex_enter(&ump->um_lock); 400 if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) { 401 DIP_ADD(ip, blocks, btodb(nsize - osize)); 402 ip->i_flag |= IN_CHANGE | IN_UPDATE; 403 404 if (bpp != NULL) { 405 if (bp->b_blkno != FFS_FSBTODB(fs, bno)) 406 panic("bad blockno"); 407 allocbuf(bp, nsize, 1); 408 memset((char *)bp->b_data + osize, 0, nsize - osize); 409 mutex_enter(bp->b_objlock); 410 KASSERT(!cv_has_waiters(&bp->b_done)); 411 bp->b_oflags |= BO_DONE; 412 mutex_exit(bp->b_objlock); 413 *bpp = bp; 414 } 415 if (blknop != NULL) { 416 *blknop = bno; 417 } 418 return (0); 419 } 420 /* 421 * Allocate a new disk location. 422 */ 423 if (bpref >= fs->fs_size) 424 bpref = 0; 425 switch ((int)fs->fs_optim) { 426 case FS_OPTSPACE: 427 /* 428 * Allocate an exact sized fragment. Although this makes 429 * best use of space, we will waste time relocating it if 430 * the file continues to grow. If the fragmentation is 431 * less than half of the minimum free reserve, we choose 432 * to begin optimizing for time. 
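 * For example (illustrative figures, not taken from any particular
 * filesystem): with fs_minfree at 5%, the change to FS_OPTTIME below
 * is made once the free fragment count has fallen to at most
 * fs_dsize * 5 / 200, i.e. 2.5% of the data area; with fs_minfree
 * below 5% the optimization strategy is never changed here.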
433 */ 434 request = nsize; 435 if (fs->fs_minfree < 5 || 436 fs->fs_cstotal.cs_nffree > 437 fs->fs_dsize * fs->fs_minfree / (2 * 100)) 438 break; 439 440 if (ffs_log_changeopt) { 441 log(LOG_NOTICE, 442 "%s: optimization changed from SPACE to TIME\n", 443 fs->fs_fsmnt); 444 } 445 446 fs->fs_optim = FS_OPTTIME; 447 break; 448 case FS_OPTTIME: 449 /* 450 * At this point we have discovered a file that is trying to 451 * grow a small fragment to a larger fragment. To save time, 452 * we allocate a full sized block, then free the unused portion. 453 * If the file continues to grow, the `ffs_fragextend' call 454 * above will be able to grow it in place without further 455 * copying. If aberrant programs cause disk fragmentation to 456 * grow within 2% of the free reserve, we choose to begin 457 * optimizing for space. 458 */ 459 request = fs->fs_bsize; 460 if (fs->fs_cstotal.cs_nffree < 461 fs->fs_dsize * (fs->fs_minfree - 2) / 100) 462 break; 463 464 if (ffs_log_changeopt) { 465 log(LOG_NOTICE, 466 "%s: optimization changed from TIME to SPACE\n", 467 fs->fs_fsmnt); 468 } 469 470 fs->fs_optim = FS_OPTSPACE; 471 break; 472 default: 473 printf("dev = 0x%llx, optim = %d, fs = %s\n", 474 (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt); 475 panic("ffs_realloccg: bad optim"); 476 /* NOTREACHED */ 477 } 478 bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg); 479 if (bno > 0) { 480 if ((ip->i_ump->um_mountp->mnt_wapbl) && 481 (ITOV(ip)->v_type != VREG)) { 482 UFS_WAPBL_REGISTER_DEALLOCATION( 483 ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev), 484 osize); 485 } else { 486 ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, 487 ip->i_number); 488 } 489 if (nsize < request) { 490 if ((ip->i_ump->um_mountp->mnt_wapbl) && 491 (ITOV(ip)->v_type != VREG)) { 492 UFS_WAPBL_REGISTER_DEALLOCATION( 493 ip->i_ump->um_mountp, 494 FFS_FSBTODB(fs, (bno + ffs_numfrags(fs, nsize))), 495 request - nsize); 496 } else 497 ffs_blkfree(fs, ip->i_devvp, 498 bno + ffs_numfrags(fs, nsize), 499 (long)(request - nsize), ip->i_number); 500 } 501 DIP_ADD(ip, blocks, btodb(nsize - osize)); 502 ip->i_flag |= IN_CHANGE | IN_UPDATE; 503 if (bpp != NULL) { 504 bp->b_blkno = FFS_FSBTODB(fs, bno); 505 allocbuf(bp, nsize, 1); 506 memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); 507 mutex_enter(bp->b_objlock); 508 KASSERT(!cv_has_waiters(&bp->b_done)); 509 bp->b_oflags |= BO_DONE; 510 mutex_exit(bp->b_objlock); 511 *bpp = bp; 512 } 513 if (blknop != NULL) { 514 *blknop = bno; 515 } 516 return (0); 517 } 518 mutex_exit(&ump->um_lock); 519 520 #if defined(QUOTA) || defined(QUOTA2) 521 /* 522 * Restore user's disk quota because allocation failed. 523 */ 524 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 525 #endif 526 if (bpp != NULL) { 527 brelse(bp, 0); 528 } 529 530 nospace: 531 /* 532 * no space available 533 */ 534 ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); 535 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); 536 return (ENOSPC); 537 } 538 539 /* 540 * Allocate an inode in the file system. 541 * 542 * If allocating a directory, use ffs_dirpref to select the inode. 543 * If allocating in a directory, the following hierarchy is followed: 544 * 1) allocate the preferred inode. 545 * 2) allocate an inode in the same cylinder group. 546 * 3) quadradically rehash into other cylinder groups, until an 547 * available inode is located. 
548 * If no inode preference is given the following hierarchy is used 549 * to allocate an inode: 550 * 1) allocate an inode in cylinder group 0. 551 * 2) quadratically rehash into other cylinder groups, until an 552 * available inode is located. 553 * 554 * => um_lock not held upon entry or return 555 */ 556 int 557 ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, 558 struct vnode **vpp) 559 { 560 struct ufsmount *ump; 561 struct inode *pip; 562 struct fs *fs; 563 struct inode *ip; 564 struct timespec ts; 565 ino_t ino, ipref; 566 int cg, error; 567 568 UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); 569 570 *vpp = NULL; 571 pip = VTOI(pvp); 572 fs = pip->i_fs; 573 ump = pip->i_ump; 574 575 error = UFS_WAPBL_BEGIN(pvp->v_mount); 576 if (error) { 577 return error; 578 } 579 mutex_enter(&ump->um_lock); 580 if (fs->fs_cstotal.cs_nifree == 0) 581 goto noinodes; 582 583 if ((mode & IFMT) == IFDIR) 584 ipref = ffs_dirpref(pip); 585 else 586 ipref = pip->i_number; 587 if (ipref >= fs->fs_ncg * fs->fs_ipg) 588 ipref = 0; 589 cg = ino_to_cg(fs, ipref); 590 /* 591 * Track the number of dirs created one after another 592 * in the same cg without intervening files. 593 */ 594 if ((mode & IFMT) == IFDIR) { 595 if (fs->fs_contigdirs[cg] < 255) 596 fs->fs_contigdirs[cg]++; 597 } else { 598 if (fs->fs_contigdirs[cg] > 0) 599 fs->fs_contigdirs[cg]--; 600 } 601 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg); 602 if (ino == 0) 603 goto noinodes; 604 UFS_WAPBL_END(pvp->v_mount); 605 error = VFS_VGET(pvp->v_mount, ino, vpp); 606 if (error) { 607 int err; 608 err = UFS_WAPBL_BEGIN(pvp->v_mount); 609 if (err == 0) 610 ffs_vfree(pvp, ino, mode); 611 if (err == 0) 612 UFS_WAPBL_END(pvp->v_mount); 613 return (error); 614 } 615 KASSERT((*vpp)->v_type == VNON); 616 ip = VTOI(*vpp); 617 if (ip->i_mode) { 618 #if 0 619 printf("mode = 0%o, inum = %d, fs = %s\n", 620 ip->i_mode, ip->i_number, fs->fs_fsmnt); 621 #else 622 printf("dmode %x mode %x dgen %x gen %x\n", 623 DIP(ip, mode), ip->i_mode, 624 DIP(ip, gen), ip->i_gen); 625 printf("size %llx blocks %llx\n", 626 (long long)DIP(ip, size), (long long)DIP(ip, blocks)); 627 printf("ino %llu ipref %llu\n", (unsigned long long)ino, 628 (unsigned long long)ipref); 629 #if 0 630 error = bread(ump->um_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ino)), 631 (int)fs->fs_bsize, NOCRED, 0, &bp); 632 #endif 633 634 #endif 635 panic("ffs_valloc: dup alloc"); 636 } 637 if (DIP(ip, blocks)) { /* XXX */ 638 printf("free inode %llu on %s had %" PRId64 " blocks\n", 639 (unsigned long long)ino, fs->fs_fsmnt, DIP(ip, blocks)); 640 DIP_ASSIGN(ip, blocks, 0); 641 } 642 ip->i_flag &= ~IN_SPACECOUNTED; 643 ip->i_flags = 0; 644 DIP_ASSIGN(ip, flags, 0); 645 /* 646 * Set up a new generation number for this inode. 647 */ 648 ip->i_gen++; 649 DIP_ASSIGN(ip, gen, ip->i_gen); 650 if (fs->fs_magic == FS_UFS2_MAGIC) { 651 vfs_timestamp(&ts); 652 ip->i_ffs2_birthtime = ts.tv_sec; 653 ip->i_ffs2_birthnsec = ts.tv_nsec; 654 } 655 return (0); 656 noinodes: 657 mutex_exit(&ump->um_lock); 658 UFS_WAPBL_END(pvp->v_mount); 659 ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes"); 660 uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); 661 return (ENOSPC); 662 } 663 664 /* 665 * Find a cylinder group in which to place a directory. 666 * 667 * The policy implemented by this algorithm is to allocate a 668 * directory inode in the same cylinder group as its parent 669 * directory, but also to reserve space for its files' inodes 670 * and data.
Restrict the number of directories which may be 671 * allocated one after another in the same cylinder group 672 * without intervening allocation of files. 673 * 674 * If we allocate a first level directory then force allocation 675 * in another cylinder group. 676 */ 677 static ino_t 678 ffs_dirpref(struct inode *pip) 679 { 680 register struct fs *fs; 681 int cg, prefcg; 682 int64_t dirsize, cgsize, curdsz; 683 int avgifree, avgbfree, avgndir; 684 int minifree, minbfree, maxndir; 685 int mincg, minndir; 686 int maxcontigdirs; 687 688 KASSERT(mutex_owned(&pip->i_ump->um_lock)); 689 690 fs = pip->i_fs; 691 692 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 693 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 694 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 695 696 /* 697 * Force allocation in another cg if creating a first level dir. 698 */ 699 if (ITOV(pip)->v_vflag & VV_ROOT) { 700 prefcg = random() % fs->fs_ncg; 701 mincg = prefcg; 702 minndir = fs->fs_ipg; 703 for (cg = prefcg; cg < fs->fs_ncg; cg++) 704 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 705 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 706 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 707 mincg = cg; 708 minndir = fs->fs_cs(fs, cg).cs_ndir; 709 } 710 for (cg = 0; cg < prefcg; cg++) 711 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 712 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 713 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 714 mincg = cg; 715 minndir = fs->fs_cs(fs, cg).cs_ndir; 716 } 717 return ((ino_t)(fs->fs_ipg * mincg)); 718 } 719 720 /* 721 * Count various limits which used for 722 * optimal allocation of a directory inode. 723 * Try cylinder groups with >75% avgifree and avgbfree. 724 * Avoid cylinder groups with no free blocks or inodes as that 725 * triggers an I/O-expensive cylinder group scan. 726 */ 727 maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); 728 minifree = avgifree - avgifree / 4; 729 if (minifree < 1) 730 minifree = 1; 731 minbfree = avgbfree - avgbfree / 4; 732 if (minbfree < 1) 733 minbfree = 1; 734 cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg; 735 dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir; 736 if (avgndir != 0) { 737 curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir; 738 if (dirsize < curdsz) 739 dirsize = curdsz; 740 } 741 if (cgsize < dirsize * 255) 742 maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize; 743 else 744 maxcontigdirs = 255; 745 if (fs->fs_avgfpdir > 0) 746 maxcontigdirs = min(maxcontigdirs, 747 fs->fs_ipg / fs->fs_avgfpdir); 748 if (maxcontigdirs == 0) 749 maxcontigdirs = 1; 750 751 /* 752 * Limit number of dirs in one cg and reserve space for 753 * regular files, but only if we have no deficit in 754 * inodes or space. 755 */ 756 prefcg = ino_to_cg(fs, pip->i_number); 757 for (cg = prefcg; cg < fs->fs_ncg; cg++) 758 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 759 fs->fs_cs(fs, cg).cs_nifree >= minifree && 760 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 761 if (fs->fs_contigdirs[cg] < maxcontigdirs) 762 return ((ino_t)(fs->fs_ipg * cg)); 763 } 764 for (cg = 0; cg < prefcg; cg++) 765 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 766 fs->fs_cs(fs, cg).cs_nifree >= minifree && 767 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 768 if (fs->fs_contigdirs[cg] < maxcontigdirs) 769 return ((ino_t)(fs->fs_ipg * cg)); 770 } 771 /* 772 * This is a backstop when we are deficient in space. 
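 * Scan forward from the preferred cg and then wrap around from cg 0,
 * settling for the first cg that holds at least the average number of
 * free inodes; if no cg qualifies, the preferred cg itself is used.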
773 */ 774 for (cg = prefcg; cg < fs->fs_ncg; cg++) 775 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 776 return ((ino_t)(fs->fs_ipg * cg)); 777 for (cg = 0; cg < prefcg; cg++) 778 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 779 break; 780 return ((ino_t)(fs->fs_ipg * cg)); 781 } 782 783 /* 784 * Select the desired position for the next block in a file. The file is 785 * logically divided into sections. The first section is composed of the 786 * direct blocks. Each additional section contains fs_maxbpg blocks. 787 * 788 * If no blocks have been allocated in the first section, the policy is to 789 * request a block in the same cylinder group as the inode that describes 790 * the file. If no blocks have been allocated in any other section, the 791 * policy is to place the section in a cylinder group with a greater than 792 * average number of free blocks. An appropriate cylinder group is found 793 * by using a rotor that sweeps the cylinder groups. When a new group of 794 * blocks is needed, the sweep begins in the cylinder group following the 795 * cylinder group from which the previous allocation was made. The sweep 796 * continues until a cylinder group with greater than the average number 797 * of free blocks is found. If the allocation is for the first block in an 798 * indirect block, the information on the previous allocation is unavailable; 799 * here a best guess is made based upon the logical block number being 800 * allocated. 801 * 802 * If a section is already partially allocated, the policy is to 803 * contiguously allocate fs_maxcontig blocks. The end of one of these 804 * contiguous blocks and the beginning of the next is laid out 805 * contiguously if possible. 806 * 807 * => um_lock held on entry and exit 808 */ 809 daddr_t 810 ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, 811 int32_t *bap /* XXX ondisk32 */) 812 { 813 struct fs *fs; 814 int cg; 815 int avgbfree, startcg; 816 817 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 818 819 fs = ip->i_fs; 820 821 /* 822 * If allocating a contiguous file with B_CONTIG, use the hints 823 * in the inode extensions to return the desired block. 824 * 825 * For metadata (indirect blocks) return the address of where 826 * the first indirect block resides - we'll scan for the next 827 * available slot if we need to allocate more than one indirect 828 * block. For data, return the address of the actual block 829 * relative to the address of the first data block. 830 */ 831 if (flags & B_CONTIG) { 832 KASSERT(ip->i_ffs_first_data_blk != 0); 833 KASSERT(ip->i_ffs_first_indir_blk != 0); 834 if (flags & B_METAONLY) 835 return ip->i_ffs_first_indir_blk; 836 else 837 return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); 838 } 839 840 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 841 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { 842 cg = ino_to_cg(fs, ip->i_number); 843 return (cgbase(fs, cg) + fs->fs_frag); 844 } 845 /* 846 * Find a cylinder with greater than average number of 847 * unused data blocks.
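 * The search starts either at the inode's own cg offset by
 * lbn / fs_maxbpg (when no previous block is known) or at the cg
 * just past the previous allocation, and sweeps forward through
 * all cylinder groups, wrapping around to cg 0.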
848 */ 849 if (indx == 0 || bap[indx - 1] == 0) 850 startcg = 851 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 852 else 853 startcg = dtog(fs, 854 ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); 855 startcg %= fs->fs_ncg; 856 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 857 for (cg = startcg; cg < fs->fs_ncg; cg++) 858 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 859 return (cgbase(fs, cg) + fs->fs_frag); 860 } 861 for (cg = 0; cg < startcg; cg++) 862 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 863 return (cgbase(fs, cg) + fs->fs_frag); 864 } 865 return (0); 866 } 867 /* 868 * We just always try to lay things out contiguously. 869 */ 870 return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; 871 } 872 873 daddr_t 874 ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, 875 int64_t *bap) 876 { 877 struct fs *fs; 878 int cg; 879 int avgbfree, startcg; 880 881 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 882 883 fs = ip->i_fs; 884 885 /* 886 * If allocating a contiguous file with B_CONTIG, use the hints 887 * in the inode extentions to return the desired block. 888 * 889 * For metadata (indirect blocks) return the address of where 890 * the first indirect block resides - we'll scan for the next 891 * available slot if we need to allocate more than one indirect 892 * block. For data, return the address of the actual block 893 * relative to the address of the first data block. 894 */ 895 if (flags & B_CONTIG) { 896 KASSERT(ip->i_ffs_first_data_blk != 0); 897 KASSERT(ip->i_ffs_first_indir_blk != 0); 898 if (flags & B_METAONLY) 899 return ip->i_ffs_first_indir_blk; 900 else 901 return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); 902 } 903 904 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 905 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { 906 cg = ino_to_cg(fs, ip->i_number); 907 return (cgbase(fs, cg) + fs->fs_frag); 908 } 909 /* 910 * Find a cylinder with greater than average number of 911 * unused data blocks. 912 */ 913 if (indx == 0 || bap[indx - 1] == 0) 914 startcg = 915 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 916 else 917 startcg = dtog(fs, 918 ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); 919 startcg %= fs->fs_ncg; 920 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 921 for (cg = startcg; cg < fs->fs_ncg; cg++) 922 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 923 return (cgbase(fs, cg) + fs->fs_frag); 924 } 925 for (cg = 0; cg < startcg; cg++) 926 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 927 return (cgbase(fs, cg) + fs->fs_frag); 928 } 929 return (0); 930 } 931 /* 932 * We just always try to lay things out contiguously. 933 */ 934 return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; 935 } 936 937 938 /* 939 * Implement the cylinder overflow algorithm. 940 * 941 * The policy implemented by this algorithm is: 942 * 1) allocate the block in its requested cylinder group. 943 * 2) quadradically rehash on the cylinder group number. 944 * 3) brute force search for a free block. 
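 * As an illustration (numbers made up): with fs_ncg = 16 and a
 * starting cg of 5, the quadratic rehash probes cgs 5, 6, 8, 12 and 4,
 * after which the brute force pass starts at cg 7 and walks the
 * remaining groups in order.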
945 * 946 * => called with um_lock held 947 * => returns with um_lock released on success, held on failure 948 * (*allocator releases lock on success, retains lock on failure) 949 */ 950 /*VARARGS5*/ 951 static daddr_t 952 ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, 953 int size /* size for data blocks, mode for inodes */, 954 int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int)) 955 { 956 struct fs *fs; 957 daddr_t result; 958 int i, icg = cg; 959 960 fs = ip->i_fs; 961 /* 962 * 1: preferred cylinder group 963 */ 964 result = (*allocator)(ip, cg, pref, size, flags); 965 if (result) 966 return (result); 967 968 if (flags & B_CONTIG) 969 return (result); 970 /* 971 * 2: quadratic rehash 972 */ 973 for (i = 1; i < fs->fs_ncg; i *= 2) { 974 cg += i; 975 if (cg >= fs->fs_ncg) 976 cg -= fs->fs_ncg; 977 result = (*allocator)(ip, cg, 0, size, flags); 978 if (result) 979 return (result); 980 } 981 /* 982 * 3: brute force search 983 * Note that we start at i == 2, since 0 was checked initially, 984 * and 1 is always checked in the quadratic rehash. 985 */ 986 cg = (icg + 2) % fs->fs_ncg; 987 for (i = 2; i < fs->fs_ncg; i++) { 988 result = (*allocator)(ip, cg, 0, size, flags); 989 if (result) 990 return (result); 991 cg++; 992 if (cg == fs->fs_ncg) 993 cg = 0; 994 } 995 return (0); 996 } 997 998 /* 999 * Determine whether a fragment can be extended. 1000 * 1001 * Check to see if the necessary fragments are available, and 1002 * if they are, allocate them. 1003 * 1004 * => called with um_lock held 1005 * => returns with um_lock released on success, held on failure 1006 */ 1007 static daddr_t 1008 ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize) 1009 { 1010 struct ufsmount *ump; 1011 struct fs *fs; 1012 struct cg *cgp; 1013 struct buf *bp; 1014 daddr_t bno; 1015 int frags, bbase; 1016 int i, error; 1017 u_int8_t *blksfree; 1018 1019 fs = ip->i_fs; 1020 ump = ip->i_ump; 1021 1022 KASSERT(mutex_owned(&ump->um_lock)); 1023 1024 if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize)) 1025 return (0); 1026 frags = ffs_numfrags(fs, nsize); 1027 bbase = ffs_fragnum(fs, bprev); 1028 if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) { 1029 /* cannot extend across a block boundary */ 1030 return (0); 1031 } 1032 mutex_exit(&ump->um_lock); 1033 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1034 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); 1035 if (error) 1036 goto fail; 1037 cgp = (struct cg *)bp->b_data; 1038 if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) 1039 goto fail; 1040 cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs)); 1041 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1042 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1043 cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs)); 1044 bno = dtogd(fs, bprev); 1045 blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)); 1046 for (i = ffs_numfrags(fs, osize); i < frags; i++) 1047 if (isclr(blksfree, bno + i)) 1048 goto fail; 1049 /* 1050 * the current fragment can be extended 1051 * deduct the count on fragment being extended into 1052 * increase the count on the remaining fragment (if any) 1053 * allocate the extended piece 1054 */ 1055 for (i = frags; i < fs->fs_frag - bbase; i++) 1056 if (isclr(blksfree, bno + i)) 1057 break; 1058 ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs)); 1059 if (i != frags) 1060 ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs)); 1061 mutex_enter(&ump->um_lock); 1062 for (i = ffs_numfrags(fs, osize); i < frags; i++) { 1063 
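		/*
		 * Mark each fragment gained by the extension as allocated
		 * and charge it against the per-cg and filesystem-wide
		 * free fragment counts.
		 */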
clrbit(blksfree, bno + i); 1064 ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs)); 1065 fs->fs_cstotal.cs_nffree--; 1066 fs->fs_cs(fs, cg).cs_nffree--; 1067 } 1068 fs->fs_fmod = 1; 1069 ACTIVECG_CLR(fs, cg); 1070 mutex_exit(&ump->um_lock); 1071 bdwrite(bp); 1072 return (bprev); 1073 1074 fail: 1075 if (bp != NULL) 1076 brelse(bp, 0); 1077 mutex_enter(&ump->um_lock); 1078 return (0); 1079 } 1080 1081 /* 1082 * Determine whether a block can be allocated. 1083 * 1084 * Check to see if a block of the appropriate size is available, 1085 * and if it is, allocate it. 1086 */ 1087 static daddr_t 1088 ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags) 1089 { 1090 struct ufsmount *ump; 1091 struct fs *fs = ip->i_fs; 1092 struct cg *cgp; 1093 struct buf *bp; 1094 int32_t bno; 1095 daddr_t blkno; 1096 int error, frags, allocsiz, i; 1097 u_int8_t *blksfree; 1098 const int needswap = UFS_FSNEEDSWAP(fs); 1099 1100 ump = ip->i_ump; 1101 1102 KASSERT(mutex_owned(&ump->um_lock)); 1103 1104 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1105 return (0); 1106 mutex_exit(&ump->um_lock); 1107 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1108 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); 1109 if (error) 1110 goto fail; 1111 cgp = (struct cg *)bp->b_data; 1112 if (!cg_chkmagic(cgp, needswap) || 1113 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1114 goto fail; 1115 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1116 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1117 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1118 cgp->cg_time = ufs_rw64(time_second, needswap); 1119 if (size == fs->fs_bsize) { 1120 mutex_enter(&ump->um_lock); 1121 blkno = ffs_alloccgblk(ip, bp, bpref, flags); 1122 ACTIVECG_CLR(fs, cg); 1123 mutex_exit(&ump->um_lock); 1124 bdwrite(bp); 1125 return (blkno); 1126 } 1127 /* 1128 * check to see if any fragments are already available 1129 * allocsiz is the size which will be allocated, hacking 1130 * it down to a smaller size if necessary 1131 */ 1132 blksfree = cg_blksfree(cgp, needswap); 1133 frags = ffs_numfrags(fs, size); 1134 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1135 if (cgp->cg_frsum[allocsiz] != 0) 1136 break; 1137 if (allocsiz == fs->fs_frag) { 1138 /* 1139 * no fragments were available, so a block will be 1140 * allocated, and hacked up 1141 */ 1142 if (cgp->cg_cs.cs_nbfree == 0) 1143 goto fail; 1144 mutex_enter(&ump->um_lock); 1145 blkno = ffs_alloccgblk(ip, bp, bpref, flags); 1146 bno = dtogd(fs, blkno); 1147 for (i = frags; i < fs->fs_frag; i++) 1148 setbit(blksfree, bno + i); 1149 i = fs->fs_frag - frags; 1150 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); 1151 fs->fs_cstotal.cs_nffree += i; 1152 fs->fs_cs(fs, cg).cs_nffree += i; 1153 fs->fs_fmod = 1; 1154 ufs_add32(cgp->cg_frsum[i], 1, needswap); 1155 ACTIVECG_CLR(fs, cg); 1156 mutex_exit(&ump->um_lock); 1157 bdwrite(bp); 1158 return (blkno); 1159 } 1160 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1161 #if 0 1162 /* 1163 * XXX fvdl mapsearch will panic, and never return -1 1164 * also: returning NULL as daddr_t ? 
1165 */ 1166 if (bno < 0) 1167 goto fail; 1168 #endif 1169 for (i = 0; i < frags; i++) 1170 clrbit(blksfree, bno + i); 1171 mutex_enter(&ump->um_lock); 1172 ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); 1173 fs->fs_cstotal.cs_nffree -= frags; 1174 fs->fs_cs(fs, cg).cs_nffree -= frags; 1175 fs->fs_fmod = 1; 1176 ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); 1177 if (frags != allocsiz) 1178 ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); 1179 blkno = cgbase(fs, cg) + bno; 1180 ACTIVECG_CLR(fs, cg); 1181 mutex_exit(&ump->um_lock); 1182 bdwrite(bp); 1183 return blkno; 1184 1185 fail: 1186 if (bp != NULL) 1187 brelse(bp, 0); 1188 mutex_enter(&ump->um_lock); 1189 return (0); 1190 } 1191 1192 /* 1193 * Allocate a block in a cylinder group. 1194 * 1195 * This algorithm implements the following policy: 1196 * 1) allocate the requested block. 1197 * 2) allocate a rotationally optimal block in the same cylinder. 1198 * 3) allocate the next available block on the block rotor for the 1199 * specified cylinder group. 1200 * Note that this routine only allocates fs_bsize blocks; these 1201 * blocks may be fragmented by the routine that allocates them. 1202 */ 1203 static daddr_t 1204 ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags) 1205 { 1206 struct fs *fs = ip->i_fs; 1207 struct cg *cgp; 1208 int cg; 1209 daddr_t blkno; 1210 int32_t bno; 1211 u_int8_t *blksfree; 1212 const int needswap = UFS_FSNEEDSWAP(fs); 1213 1214 KASSERT(mutex_owned(&ip->i_ump->um_lock)); 1215 1216 cgp = (struct cg *)bp->b_data; 1217 blksfree = cg_blksfree(cgp, needswap); 1218 if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { 1219 bpref = ufs_rw32(cgp->cg_rotor, needswap); 1220 } else { 1221 bpref = ffs_blknum(fs, bpref); 1222 bno = dtogd(fs, bpref); 1223 /* 1224 * if the requested block is available, use it 1225 */ 1226 if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno))) 1227 goto gotit; 1228 /* 1229 * if the requested data block isn't available and we are 1230 * trying to allocate a contiguous file, return an error. 1231 */ 1232 if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) 1233 return (0); 1234 } 1235 1236 /* 1237 * Take the next available block in this cylinder group. 1238 */ 1239 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1240 if (bno < 0) 1241 return (0); 1242 cgp->cg_rotor = ufs_rw32(bno, needswap); 1243 gotit: 1244 blkno = ffs_fragstoblks(fs, bno); 1245 ffs_clrblock(fs, blksfree, blkno); 1246 ffs_clusteracct(fs, cgp, blkno, -1); 1247 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1248 fs->fs_cstotal.cs_nbfree--; 1249 fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; 1250 if ((fs->fs_magic == FS_UFS1_MAGIC) && 1251 ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { 1252 int cylno; 1253 cylno = old_cbtocylno(fs, bno); 1254 KASSERT(cylno >= 0); 1255 KASSERT(cylno < fs->fs_old_ncyl); 1256 KASSERT(old_cbtorpos(fs, bno) >= 0); 1257 KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos); 1258 ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1, 1259 needswap); 1260 ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap); 1261 } 1262 fs->fs_fmod = 1; 1263 cg = ufs_rw32(cgp->cg_cgx, needswap); 1264 blkno = cgbase(fs, cg) + bno; 1265 return (blkno); 1266 } 1267 1268 /* 1269 * Determine whether an inode can be allocated. 1270 * 1271 * Check to see if an inode is available, and if it is, 1272 * allocate it using the following policy: 1273 * 1) allocate the requested inode. 
1274 * 2) allocate the next available inode after the requested 1275 * inode in the specified cylinder group. 1276 */ 1277 static daddr_t 1278 ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags) 1279 { 1280 struct ufsmount *ump = ip->i_ump; 1281 struct fs *fs = ip->i_fs; 1282 struct cg *cgp; 1283 struct buf *bp, *ibp; 1284 u_int8_t *inosused; 1285 int error, start, len, loc, map, i; 1286 int32_t initediblk; 1287 daddr_t nalloc; 1288 struct ufs2_dinode *dp2; 1289 const int needswap = UFS_FSNEEDSWAP(fs); 1290 1291 KASSERT(mutex_owned(&ump->um_lock)); 1292 UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); 1293 1294 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1295 return (0); 1296 mutex_exit(&ump->um_lock); 1297 ibp = NULL; 1298 initediblk = -1; 1299 retry: 1300 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1301 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); 1302 if (error) 1303 goto fail; 1304 cgp = (struct cg *)bp->b_data; 1305 if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) 1306 goto fail; 1307 1308 if (ibp != NULL && 1309 initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { 1310 /* Another thread allocated more inodes so we retry the test. */ 1311 brelse(ibp, 0); 1312 ibp = NULL; 1313 } 1314 /* 1315 * Check to see if we need to initialize more inodes. 1316 */ 1317 if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) { 1318 initediblk = ufs_rw32(cgp->cg_initediblk, needswap); 1319 nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap); 1320 if (nalloc + FFS_INOPB(fs) > initediblk && 1321 initediblk < ufs_rw32(cgp->cg_niblk, needswap)) { 1322 /* 1323 * We have to release the cg buffer here to prevent 1324 * a deadlock when reading the inode block will 1325 * run a copy-on-write that might use this cg. 1326 */ 1327 brelse(bp, 0); 1328 bp = NULL; 1329 error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs, 1330 ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), 1331 FFS_NOBLK, fs->fs_bsize, false, &ibp); 1332 if (error) 1333 goto fail; 1334 goto retry; 1335 } 1336 } 1337 1338 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1339 if ((fs->fs_magic != FS_UFS1_MAGIC) || 1340 (fs->fs_old_flags & FS_FLAGS_UPDATED)) 1341 cgp->cg_time = ufs_rw64(time_second, needswap); 1342 inosused = cg_inosused(cgp, needswap); 1343 if (ipref) { 1344 ipref %= fs->fs_ipg; 1345 if (isclr(inosused, ipref)) 1346 goto gotit; 1347 } 1348 start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY; 1349 len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap), 1350 NBBY); 1351 loc = skpc(0xff, len, &inosused[start]); 1352 if (loc == 0) { 1353 len = start + 1; 1354 start = 0; 1355 loc = skpc(0xff, len, &inosused[0]); 1356 if (loc == 0) { 1357 printf("cg = %d, irotor = %d, fs = %s\n", 1358 cg, ufs_rw32(cgp->cg_irotor, needswap), 1359 fs->fs_fsmnt); 1360 panic("ffs_nodealloccg: map corrupted"); 1361 /* NOTREACHED */ 1362 } 1363 } 1364 i = start + len - loc; 1365 map = inosused[i] ^ 0xff; 1366 if (map == 0) { 1367 printf("fs = %s\n", fs->fs_fsmnt); 1368 panic("ffs_nodealloccg: block not in map"); 1369 } 1370 ipref = i * NBBY + ffs(map) - 1; 1371 cgp->cg_irotor = ufs_rw32(ipref, needswap); 1372 gotit: 1373 UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, 1374 mode); 1375 /* 1376 * Check to see if we need to initialize more inodes. 
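 * If a fresh inode block was read above, zero it and seed every
 * on-disk inode in it with a random generation number before
 * advancing cg_initediblk past the newly initialized inodes.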
1377 */ 1378 if (ibp != NULL) { 1379 KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap)); 1380 memset(ibp->b_data, 0, fs->fs_bsize); 1381 dp2 = (struct ufs2_dinode *)(ibp->b_data); 1382 for (i = 0; i < FFS_INOPB(fs); i++) { 1383 /* 1384 * Don't bother to swap, it's supposed to be 1385 * random, after all. 1386 */ 1387 dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1; 1388 dp2++; 1389 } 1390 initediblk += FFS_INOPB(fs); 1391 cgp->cg_initediblk = ufs_rw32(initediblk, needswap); 1392 } 1393 1394 mutex_enter(&ump->um_lock); 1395 ACTIVECG_CLR(fs, cg); 1396 setbit(inosused, ipref); 1397 ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap); 1398 fs->fs_cstotal.cs_nifree--; 1399 fs->fs_cs(fs, cg).cs_nifree--; 1400 fs->fs_fmod = 1; 1401 if ((mode & IFMT) == IFDIR) { 1402 ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap); 1403 fs->fs_cstotal.cs_ndir++; 1404 fs->fs_cs(fs, cg).cs_ndir++; 1405 } 1406 mutex_exit(&ump->um_lock); 1407 if (ibp != NULL) { 1408 bwrite(bp); 1409 bawrite(ibp); 1410 } else 1411 bdwrite(bp); 1412 return (cg * fs->fs_ipg + ipref); 1413 fail: 1414 if (bp != NULL) 1415 brelse(bp, 0); 1416 if (ibp != NULL) 1417 brelse(ibp, 0); 1418 mutex_enter(&ump->um_lock); 1419 return (0); 1420 } 1421 1422 /* 1423 * Allocate a block or fragment. 1424 * 1425 * The specified block or fragment is removed from the 1426 * free map, possibly fragmenting a block in the process. 1427 * 1428 * This implementation should mirror fs_blkfree 1429 * 1430 * => um_lock not held on entry or exit 1431 */ 1432 int 1433 ffs_blkalloc(struct inode *ip, daddr_t bno, long size) 1434 { 1435 int error; 1436 1437 error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size, 1438 ip->i_dev, ip->i_uid); 1439 if (error) 1440 return error; 1441 1442 return ffs_blkalloc_ump(ip->i_ump, bno, size); 1443 } 1444 1445 int 1446 ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size) 1447 { 1448 struct fs *fs = ump->um_fs; 1449 struct cg *cgp; 1450 struct buf *bp; 1451 int32_t fragno, cgbno; 1452 int i, error, cg, blk, frags, bbase; 1453 u_int8_t *blksfree; 1454 const int needswap = UFS_FSNEEDSWAP(fs); 1455 1456 KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 && 1457 ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag); 1458 KASSERT(bno < fs->fs_size); 1459 1460 cg = dtog(fs, bno); 1461 error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 1462 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); 1463 if (error) { 1464 return error; 1465 } 1466 cgp = (struct cg *)bp->b_data; 1467 if (!cg_chkmagic(cgp, needswap)) { 1468 brelse(bp, 0); 1469 return EIO; 1470 } 1471 cgp->cg_old_time = ufs_rw32(time_second, needswap); 1472 cgp->cg_time = ufs_rw64(time_second, needswap); 1473 cgbno = dtogd(fs, bno); 1474 blksfree = cg_blksfree(cgp, needswap); 1475 1476 mutex_enter(&ump->um_lock); 1477 if (size == fs->fs_bsize) { 1478 fragno = ffs_fragstoblks(fs, cgbno); 1479 if (!ffs_isblock(fs, blksfree, fragno)) { 1480 mutex_exit(&ump->um_lock); 1481 brelse(bp, 0); 1482 return EBUSY; 1483 } 1484 ffs_clrblock(fs, blksfree, fragno); 1485 ffs_clusteracct(fs, cgp, fragno, -1); 1486 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1487 fs->fs_cstotal.cs_nbfree--; 1488 fs->fs_cs(fs, cg).cs_nbfree--; 1489 } else { 1490 bbase = cgbno - ffs_fragnum(fs, cgbno); 1491 1492 frags = ffs_numfrags(fs, size); 1493 for (i = 0; i < frags; i++) { 1494 if (isclr(blksfree, cgbno + i)) { 1495 mutex_exit(&ump->um_lock); 1496 brelse(bp, 0); 1497 return EBUSY; 1498 } 1499 } 1500 /* 1501 * if a complete block is being split, account for it 1502 */ 1503 
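		/*
		 * If the enclosing block was entirely free until now,
		 * convert it into fs_frag free fragments and retire the
		 * whole-block and cluster counts; the fragments actually
		 * being allocated are deducted from the fragment counts
		 * further below.
		 */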
fragno = ffs_fragstoblks(fs, bbase); 1504 if (ffs_isblock(fs, blksfree, fragno)) { 1505 ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); 1506 fs->fs_cstotal.cs_nffree += fs->fs_frag; 1507 fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; 1508 ffs_clusteracct(fs, cgp, fragno, -1); 1509 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); 1510 fs->fs_cstotal.cs_nbfree--; 1511 fs->fs_cs(fs, cg).cs_nbfree--; 1512 } 1513 /* 1514 * decrement the counts associated with the old frags 1515 */ 1516 blk = blkmap(fs, blksfree, bbase); 1517 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); 1518 /* 1519 * allocate the fragment 1520 */ 1521 for (i = 0; i < frags; i++) { 1522 clrbit(blksfree, cgbno + i); 1523 } 1524 ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); 1525 fs->fs_cstotal.cs_nffree -= i; 1526 fs->fs_cs(fs, cg).cs_nffree -= i; 1527 /* 1528 * add back in counts associated with the new frags 1529 */ 1530 blk = blkmap(fs, blksfree, bbase); 1531 ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); 1532 } 1533 fs->fs_fmod = 1; 1534 ACTIVECG_CLR(fs, cg); 1535 mutex_exit(&ump->um_lock); 1536 bdwrite(bp); 1537 return 0; 1538 } 1539 1540 /* 1541 * Free a block or fragment. 1542 * 1543 * The specified block or fragment is placed back in the 1544 * free map. If a fragment is deallocated, a possible 1545 * block reassembly is checked. 1546 * 1547 * => um_lock not held on entry or exit 1548 */ 1549 static void 1550 ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size) 1551 { 1552 struct cg *cgp; 1553 struct buf *bp; 1554 struct ufsmount *ump; 1555 daddr_t cgblkno; 1556 int error, cg; 1557 dev_t dev; 1558 const bool devvp_is_snapshot = (devvp->v_type != VBLK); 1559 const int needswap = UFS_FSNEEDSWAP(fs); 1560 1561 KASSERT(!devvp_is_snapshot); 1562 1563 cg = dtog(fs, bno); 1564 dev = devvp->v_rdev; 1565 ump = VFSTOUFS(spec_node_getmountedfs(devvp)); 1566 KASSERT(fs == ump->um_fs); 1567 cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg)); 1568 1569 error = bread(devvp, cgblkno, (int)fs->fs_cgsize, 1570 NOCRED, B_MODIFY, &bp); 1571 if (error) { 1572 return; 1573 } 1574 cgp = (struct cg *)bp->b_data; 1575 if (!cg_chkmagic(cgp, needswap)) { 1576 brelse(bp, 0); 1577 return; 1578 } 1579 1580 ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); 1581 1582 bdwrite(bp); 1583 } 1584 1585 struct discardopdata { 1586 struct work wk; /* must be first */ 1587 struct vnode *devvp; 1588 daddr_t bno; 1589 long size; 1590 }; 1591 1592 struct discarddata { 1593 struct fs *fs; 1594 struct discardopdata *entry; 1595 long maxsize; 1596 kmutex_t entrylk; 1597 struct workqueue *wq; 1598 int wqcnt, wqdraining; 1599 kmutex_t wqlk; 1600 kcondvar_t wqcv; 1601 /* timer for flush? 
*/ 1602 }; 1603 1604 static void 1605 ffs_blkfree_td(struct fs *fs, struct discardopdata *td) 1606 { 1607 long todo; 1608 1609 while (td->size) { 1610 todo = min(td->size, 1611 ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno)))); 1612 ffs_blkfree_cg(fs, td->devvp, td->bno, todo); 1613 td->bno += ffs_numfrags(fs, todo); 1614 td->size -= todo; 1615 } 1616 } 1617 1618 static void 1619 ffs_discardcb(struct work *wk, void *arg) 1620 { 1621 struct discardopdata *td = (void *)wk; 1622 struct discarddata *ts = arg; 1623 struct fs *fs = ts->fs; 1624 struct disk_discard_range ta; 1625 #ifdef TRIMDEBUG 1626 int error; 1627 #endif 1628 1629 ta.bno = FFS_FSBTODB(fs, td->bno); 1630 ta.size = td->size >> DEV_BSHIFT; 1631 #ifdef TRIMDEBUG 1632 error = 1633 #endif 1634 VOP_IOCTL(td->devvp, DIOCDISCARD, &ta, FWRITE, FSCRED); 1635 #ifdef TRIMDEBUG 1636 printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error); 1637 #endif 1638 1639 ffs_blkfree_td(fs, td); 1640 kmem_free(td, sizeof(*td)); 1641 mutex_enter(&ts->wqlk); 1642 ts->wqcnt--; 1643 if (ts->wqdraining && !ts->wqcnt) 1644 cv_signal(&ts->wqcv); 1645 mutex_exit(&ts->wqlk); 1646 } 1647 1648 void * 1649 ffs_discard_init(struct vnode *devvp, struct fs *fs) 1650 { 1651 struct disk_discard_params tp; 1652 struct discarddata *ts; 1653 int error; 1654 1655 error = VOP_IOCTL(devvp, DIOCGDISCARDPARAMS, &tp, FREAD, FSCRED); 1656 if (error) { 1657 printf("DIOCGDISCARDPARAMS: %d\n", error); 1658 return NULL; 1659 } 1660 if (tp.maxsize * DEV_BSIZE < fs->fs_bsize) { 1661 printf("tp.maxsize=%ld, fs_bsize=%d\n", tp.maxsize, fs->fs_bsize); 1662 return NULL; 1663 } 1664 1665 ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); 1666 error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts, 1667 0, 0, 0); 1668 if (error) { 1669 kmem_free(ts, sizeof (*ts)); 1670 return NULL; 1671 } 1672 mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE); 1673 mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE); 1674 cv_init(&ts->wqcv, "trimwqcv"); 1675 ts->maxsize = max(tp.maxsize * DEV_BSIZE, 100*1024); /* XXX */ 1676 ts->fs = fs; 1677 return ts; 1678 } 1679 1680 void 1681 ffs_discard_finish(void *vts, int flags) 1682 { 1683 struct discarddata *ts = vts; 1684 struct discardopdata *td = NULL; 1685 int res = 0; 1686 1687 /* wait for workqueue to drain */ 1688 mutex_enter(&ts->wqlk); 1689 if (ts->wqcnt) { 1690 ts->wqdraining = 1; 1691 res = cv_timedwait(&ts->wqcv, &ts->wqlk, mstohz(5000)); 1692 } 1693 mutex_exit(&ts->wqlk); 1694 if (res) 1695 printf("ffs_discarddata drain timeout\n"); 1696 1697 mutex_enter(&ts->entrylk); 1698 if (ts->entry) { 1699 td = ts->entry; 1700 ts->entry = NULL; 1701 } 1702 mutex_exit(&ts->entrylk); 1703 if (td) { 1704 /* XXX don't tell disk, its optional */ 1705 ffs_blkfree_td(ts->fs, td); 1706 #ifdef TRIMDEBUG 1707 printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size); 1708 #endif 1709 kmem_free(td, sizeof(*td)); 1710 } 1711 1712 cv_destroy(&ts->wqcv); 1713 mutex_destroy(&ts->entrylk); 1714 mutex_destroy(&ts->wqlk); 1715 workqueue_destroy(ts->wq); 1716 kmem_free(ts, sizeof(*ts)); 1717 } 1718 1719 void 1720 ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, 1721 ino_t inum) 1722 { 1723 struct ufsmount *ump; 1724 int error; 1725 dev_t dev; 1726 struct discarddata *ts; 1727 struct discardopdata *td; 1728 1729 dev = devvp->v_rdev; 1730 ump = VFSTOUFS(spec_node_getmountedfs(devvp)); 1731 if (ffs_snapblkfree(fs, devvp, bno, size, inum)) 1732 return; 1733 1734 error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); 1735 if (error) 1736 
return; 1737 1738 if (!ump->um_discarddata) { 1739 ffs_blkfree_cg(fs, devvp, bno, size); 1740 return; 1741 } 1742 1743 #ifdef TRIMDEBUG 1744 printf("blkfree(%" PRId64 ",%ld)\n", bno, size); 1745 #endif 1746 ts = ump->um_discarddata; 1747 td = NULL; 1748 1749 mutex_enter(&ts->entrylk); 1750 if (ts->entry) { 1751 td = ts->entry; 1752 /* ffs deallocs backwards, check for prepend only */ 1753 if (td->bno == bno + ffs_numfrags(fs, size) 1754 && td->size + size <= ts->maxsize) { 1755 td->bno = bno; 1756 td->size += size; 1757 if (td->size < ts->maxsize) { 1758 #ifdef TRIMDEBUG 1759 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); 1760 #endif 1761 mutex_exit(&ts->entrylk); 1762 return; 1763 } 1764 size = 0; /* mark done */ 1765 } 1766 ts->entry = NULL; 1767 } 1768 mutex_exit(&ts->entrylk); 1769 1770 if (td) { 1771 #ifdef TRIMDEBUG 1772 printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size); 1773 #endif 1774 mutex_enter(&ts->wqlk); 1775 ts->wqcnt++; 1776 mutex_exit(&ts->wqlk); 1777 workqueue_enqueue(ts->wq, &td->wk, NULL); 1778 } 1779 if (!size) 1780 return; 1781 1782 td = kmem_alloc(sizeof(*td), KM_SLEEP); 1783 td->devvp = devvp; 1784 td->bno = bno; 1785 td->size = size; 1786 1787 if (td->size < ts->maxsize) { /* XXX always the case */ 1788 mutex_enter(&ts->entrylk); 1789 if (!ts->entry) { /* possible race? */ 1790 #ifdef TRIMDEBUG 1791 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); 1792 #endif 1793 ts->entry = td; 1794 td = NULL; 1795 } 1796 mutex_exit(&ts->entrylk); 1797 } 1798 if (td) { 1799 #ifdef TRIMDEBUG 1800 printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size); 1801 #endif 1802 mutex_enter(&ts->wqlk); 1803 ts->wqcnt++; 1804 mutex_exit(&ts->wqlk); 1805 workqueue_enqueue(ts->wq, &td->wk, NULL); 1806 } 1807 } 1808 1809 /* 1810 * Free a block or fragment from a snapshot cg copy. 1811 * 1812 * The specified block or fragment is placed back in the 1813 * free map. If a fragment is deallocated, a possible 1814 * block reassembly is checked. 
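 * Unlike ffs_blkfree_cg(), the cylinder group copy is read from the
 * snapshot vnode itself (hence the ffs_fragstoblks() block number)
 * rather than from the underlying device.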
 *
 * => um_lock not held on entry or exit
 */
void
ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
    ino_t inum)
{
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	daddr_t cgblkno;
	int error, cg;
	dev_t dev;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp_is_snapshot);

	cg = dtog(fs, bno);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));

	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
	if (error)
		return;

	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
	    NOCRED, B_MODIFY, &bp);
	if (error) {
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return;
	}

	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);

	bdwrite(bp);
}
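
/*
 * Update the cylinder group accounting for a freed block or fragment:
 * mark the frags free in the cg block map, adjust the cluster and
 * fragment summaries and clear the cg from the active set.  The caller
 * passes in the cg buffer and is responsible for writing it back.
 *
 * => um_lock not held on entry or exit
 */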
static void
ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
{
	struct cg *cgp;
	int32_t fragno, cgbno;
	int i, cg, blk, frags, bbase;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = dtog(fs, bno);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp, needswap);
	mutex_enter(&ump->um_lock);
	if (size == fs->fs_bsize) {
		fragno = ffs_fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp_is_snapshot) {
				mutex_exit(&ump->um_lock);
				return;
			}
			printf("dev = 0x%llx, block = %" PRId64 ", fs = %s\n",
			    (unsigned long long)dev, bno, fs->fs_fsmnt);
			panic("blkfree: freeing free block");
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
		if ((fs->fs_magic == FS_UFS1_MAGIC) &&
		    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
			i = old_cbtocylno(fs, cgbno);
			KASSERT(i >= 0);
			KASSERT(i < fs->fs_old_ncyl);
			KASSERT(old_cbtorpos(fs, cgbno) >= 0);
			KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
			ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
			    needswap);
			ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
		}
	} else {
		bbase = cgbno - ffs_fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
		/*
		 * deallocate the fragment
		 */
		frags = ffs_numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				printf("dev = 0x%llx, block = %" PRId64
				    ", fs = %s\n",
				    (unsigned long long)dev, bno + i,
				    fs->fs_fsmnt);
				panic("blkfree: freeing free frag");
			}
			setbit(blksfree, cgbno + i);
		}
		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = ffs_fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
				i = old_cbtocylno(fs, bbase);
				KASSERT(i >= 0);
				KASSERT(i < fs->fs_old_ncyl);
				KASSERT(old_cbtorpos(fs, bbase) >= 0);
				KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
				ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
				    bbase)], 1, needswap);
				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
			}
		}
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Free an inode.
 */
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{

	return ffs_freefile(vp->v_mount, ino, mode);
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 *
 * => um_lock not held on entry or exit
 */
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct vnode *devvp;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	devvp = ump->um_devvp;
	dev = devvp->v_rdev;
	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));

	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    NOCRED, B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}

	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);

	bdwrite(bp);

	return 0;
}

int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp->v_type != VBLK);

	cg = ino_to_cg(fs, ino);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    NOCRED, B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}
	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);

	bdwrite(bp);

	return 0;
}
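
/*
 * Common code for ffs_freefile and ffs_freefile_snap: clear the inode's
 * bit in the cg inode map, update the free inode (and, for directories,
 * directory) counts and clear the cg from the active set.  The caller
 * passes in the cg buffer and is responsible for writing it back.
 *
 * => um_lock not held on entry or exit
 */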
static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
	int cg;
	struct cg *cgp;
	u_int8_t *inosused;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	inosused = cg_inosused(cgp, needswap);
	ino %= fs->fs_ipg;
	if (isclr(inosused, ino)) {
		printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
		    (unsigned long long)dev, (unsigned long long)ino +
		    cg * fs->fs_ipg, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("ifree: freeing free inode");
	}
	clrbit(inosused, ino);
	if (!devvp_is_snapshot)
		UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
		    ino + cg * fs->fs_ipg, mode);
	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
		cgp->cg_irotor = ufs_rw32(ino, needswap);
	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
	mutex_enter(&ump->um_lock);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Check to see if a file is free.
 */
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
	struct cg *cgp;
	struct buf *bp;
	daddr_t cgbno;
	int ret, cg;
	u_int8_t *inosused;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);

	KASSERT(devvp_is_snapshot);

	cg = ino_to_cg(fs, ino);
	if (devvp_is_snapshot)
		cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	else
		cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		return 1;
	if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) {
		return 1;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
		brelse(bp, 0);
		return 1;
	}
	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp, 0);
	return ret;
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block when none are
 * available.
2142 */ 2143 static int32_t 2144 ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz) 2145 { 2146 int32_t bno; 2147 int start, len, loc, i; 2148 int blk, field, subfield, pos; 2149 int ostart, olen; 2150 u_int8_t *blksfree; 2151 const int needswap = UFS_FSNEEDSWAP(fs); 2152 2153 /* KASSERT(mutex_owned(&ump->um_lock)); */ 2154 2155 /* 2156 * find the fragment by searching through the free block 2157 * map for an appropriate bit pattern 2158 */ 2159 if (bpref) 2160 start = dtogd(fs, bpref) / NBBY; 2161 else 2162 start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY; 2163 blksfree = cg_blksfree(cgp, needswap); 2164 len = howmany(fs->fs_fpg, NBBY) - start; 2165 ostart = start; 2166 olen = len; 2167 loc = scanc((u_int)len, 2168 (const u_char *)&blksfree[start], 2169 (const u_char *)fragtbl[fs->fs_frag], 2170 (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); 2171 if (loc == 0) { 2172 len = start + 1; 2173 start = 0; 2174 loc = scanc((u_int)len, 2175 (const u_char *)&blksfree[0], 2176 (const u_char *)fragtbl[fs->fs_frag], 2177 (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); 2178 if (loc == 0) { 2179 printf("start = %d, len = %d, fs = %s\n", 2180 ostart, olen, fs->fs_fsmnt); 2181 printf("offset=%d %ld\n", 2182 ufs_rw32(cgp->cg_freeoff, needswap), 2183 (long)blksfree - (long)cgp); 2184 printf("cg %d\n", cgp->cg_cgx); 2185 panic("ffs_alloccg: map corrupted"); 2186 /* NOTREACHED */ 2187 } 2188 } 2189 bno = (start + len - loc) * NBBY; 2190 cgp->cg_frotor = ufs_rw32(bno, needswap); 2191 /* 2192 * found the byte in the map 2193 * sift through the bits to find the selected frag 2194 */ 2195 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 2196 blk = blkmap(fs, blksfree, bno); 2197 blk <<= 1; 2198 field = around[allocsiz]; 2199 subfield = inside[allocsiz]; 2200 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 2201 if ((blk & field) == subfield) 2202 return (bno + pos); 2203 field <<= 1; 2204 subfield <<= 1; 2205 } 2206 } 2207 printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt); 2208 panic("ffs_alloccg: block not in map"); 2209 /* return (-1); */ 2210 } 2211 2212 /* 2213 * Fserr prints the name of a file system with an error diagnostic. 2214 * 2215 * The form of the error message is: 2216 * fs: error message 2217 */ 2218 static void 2219 ffs_fserr(struct fs *fs, u_int uid, const char *cp) 2220 { 2221 2222 log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n", 2223 uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); 2224 } 2225