1 /* $NetBSD: lfs_cleanerd.c,v 1.23 2010/02/16 23:13:13 mlelstv Exp $ */ 2 3 /*- 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * The cleaner daemon for the NetBSD Log-structured File System. 34 * Only tested for use with version 2 LFSs. 35 */ 36 37 #include <sys/syslog.h> 38 #include <sys/param.h> 39 #include <sys/mount.h> 40 #include <sys/stat.h> 41 #include <ufs/ufs/inode.h> 42 #include <ufs/lfs/lfs.h> 43 44 #include <assert.h> 45 #include <err.h> 46 #include <errno.h> 47 #include <fcntl.h> 48 #include <stdio.h> 49 #include <stdlib.h> 50 #include <string.h> 51 #include <unistd.h> 52 #include <time.h> 53 #include <util.h> 54 55 #include "bufcache.h" 56 #include "vnode.h" 57 #include "lfs_user.h" 58 #include "fdfs.h" 59 #include "cleaner.h" 60 #include "kernelops.h" 61 #include "mount_lfs.h" 62 63 /* 64 * Global variables. 65 */ 66 /* XXX these top few should really be fs-specific */ 67 int use_fs_idle; /* Use fs idle rather than cpu idle time */ 68 int use_bytes; /* Use bytes written rather than segments cleaned */ 69 int load_threshold; /* How idle is idle (CPU idle) */ 70 int atatime; /* How many segments (bytes) to clean at a time */ 71 72 int nfss; /* Number of filesystems monitored by this cleanerd */ 73 struct clfs **fsp; /* Array of extended filesystem structures */ 74 int segwait_timeout; /* Time to wait in lfs_segwait() */ 75 int do_quit; /* Quit after one cleaning loop */ 76 int do_coalesce; /* Coalesce filesystem */ 77 int do_small; /* Use small writes through markv */ 78 char *copylog_filename; /* File to use for fs debugging analysis */ 79 int inval_segment; /* Segment to invalidate */ 80 int stat_report; /* Report statistics for this period of cycles */ 81 int debug; /* Turn on debugging */ 82 struct cleaner_stats { 83 double util_tot; 84 double util_sos; 85 off_t bytes_read; 86 off_t bytes_written; 87 off_t segs_cleaned; 88 off_t segs_empty; 89 off_t segs_error; 90 } cleaner_stats; 91 92 extern u_int32_t cksum(void *, size_t); 93 extern u_int32_t lfs_sb_cksum(struct dlfs *); 94 extern u_int32_t lfs_cksum_part(void *, size_t, u_int32_t); 95 extern int ufs_getlbns(struct lfs *, struct uvnode *, daddr_t, struct indir *, int *); 96 97 /* Compat */ 98 void pwarn(const char *unused, ...) { /* Does nothing */ }; 99 100 /* 101 * Log a message if debugging is turned on. 102 */ 103 void 104 dlog(const char *fmt, ...) 105 { 106 va_list ap; 107 108 if (debug == 0) 109 return; 110 111 va_start(ap, fmt); 112 vsyslog(LOG_DEBUG, fmt, ap); 113 va_end(ap); 114 } 115 116 /* 117 * Remove the specified filesystem from the list, due to its having 118 * become unmounted or other error condition. 119 */ 120 void 121 handle_error(struct clfs **cfsp, int n) 122 { 123 syslog(LOG_NOTICE, "%s: detaching cleaner", cfsp[n]->lfs_fsmnt); 124 free(cfsp[n]); 125 if (n != nfss - 1) 126 cfsp[n] = cfsp[nfss - 1]; 127 --nfss; 128 } 129 130 /* 131 * Reinitialize a filesystem if, e.g., its size changed. 132 */ 133 int 134 reinit_fs(struct clfs *fs) 135 { 136 char fsname[MNAMELEN]; 137 138 strncpy(fsname, (char *)fs->lfs_fsmnt, MNAMELEN); 139 kops.ko_close(fs->clfs_ifilefd); 140 kops.ko_close(fs->clfs_devfd); 141 fd_reclaim(fs->clfs_devvp); 142 fd_reclaim(fs->lfs_ivnode); 143 free(fs->clfs_dev); 144 free(fs->clfs_segtab); 145 free(fs->clfs_segtabp); 146 147 return init_fs(fs, fsname); 148 } 149 150 #ifdef REPAIR_ZERO_FINFO 151 /* 152 * Use fsck's lfs routines to load the Ifile from an unmounted fs. 153 * We interpret "fsname" as the name of the raw disk device. 154 */ 155 int 156 init_unmounted_fs(struct clfs *fs, char *fsname) 157 { 158 struct lfs *disc_fs; 159 int i; 160 161 fs->clfs_dev = fsname; 162 if ((fs->clfs_devfd = kops.ko_open(fs->clfs_dev, O_RDWR)) < 0) { 163 syslog(LOG_ERR, "couldn't open device %s read/write", 164 fs->clfs_dev); 165 return -1; 166 } 167 168 disc_fs = lfs_init(fs->clfs_devfd, 0, 0, 0, 0); 169 170 fs->lfs_dlfs = disc_fs->lfs_dlfs; /* Structure copy */ 171 strncpy(fs->lfs_fsmnt, fsname, MNAMELEN); 172 fs->lfs_ivnode = (struct uvnode *)disc_fs->lfs_ivnode; 173 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 174 atatime); 175 176 /* Allocate and clear segtab */ 177 fs->clfs_segtab = (struct clfs_seguse *)malloc(fs->lfs_nseg * 178 sizeof(*fs->clfs_segtab)); 179 fs->clfs_segtabp = (struct clfs_seguse **)malloc(fs->lfs_nseg * 180 sizeof(*fs->clfs_segtabp)); 181 for (i = 0; i < fs->lfs_nseg; i++) { 182 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 183 fs->clfs_segtab[i].flags = 0x0; 184 } 185 syslog(LOG_NOTICE, "%s: unmounted cleaner starting", fsname); 186 187 return 0; 188 } 189 #endif 190 191 /* 192 * Set up the file descriptors, including the Ifile descriptor. 193 * If we can't get the Ifile, this is not an LFS (or the kernel is 194 * too old to support the fcntl). 195 * XXX Merge this and init_unmounted_fs, switching on whether 196 * XXX "fsname" is a dir or a char special device. Should 197 * XXX also be able to read unmounted devices out of fstab, the way 198 * XXX fsck does. 199 */ 200 int 201 init_fs(struct clfs *fs, char *fsname) 202 { 203 struct statvfs sf; 204 int rootfd; 205 int i; 206 void *sbuf; 207 208 /* 209 * Get the raw device from the block device. 210 * XXX this is ugly. Is there a way to discover the raw device 211 * XXX for a given mount point? 212 */ 213 if (kops.ko_statvfs(fsname, &sf, ST_WAIT) < 0) 214 return -1; 215 fs->clfs_dev = malloc(strlen(sf.f_mntfromname) + 2); 216 if (fs->clfs_dev == NULL) { 217 syslog(LOG_ERR, "couldn't malloc device name string: %m"); 218 return -1; 219 } 220 sprintf(fs->clfs_dev, "/dev/r%s", sf.f_mntfromname + 5); 221 if ((fs->clfs_devfd = kops.ko_open(fs->clfs_dev, O_RDONLY, 0)) < 0) { 222 syslog(LOG_ERR, "couldn't open device %s for reading", 223 fs->clfs_dev); 224 return -1; 225 } 226 227 /* Find the Ifile and open it */ 228 if ((rootfd = kops.ko_open(fsname, O_RDONLY, 0)) < 0) 229 return -2; 230 if (kops.ko_fcntl(rootfd, LFCNIFILEFH, &fs->clfs_ifilefh) < 0) 231 return -3; 232 if ((fs->clfs_ifilefd = kops.ko_fhopen(&fs->clfs_ifilefh, 233 sizeof(fs->clfs_ifilefh), O_RDONLY)) < 0) 234 return -4; 235 kops.ko_close(rootfd); 236 237 sbuf = malloc(LFS_SBPAD); 238 if (sbuf == NULL) { 239 syslog(LOG_ERR, "couldn't malloc superblock buffer"); 240 return -1; 241 } 242 243 /* Load in the superblock */ 244 if (kops.ko_pread(fs->clfs_devfd, sbuf, LFS_SBPAD, LFS_LABELPAD) < 0) { 245 free(sbuf); 246 return -1; 247 } 248 249 memcpy(&(fs->lfs_dlfs), sbuf, sizeof(struct dlfs)); 250 free(sbuf); 251 252 /* If this is not a version 2 filesystem, complain and exit */ 253 if (fs->lfs_version != 2) { 254 syslog(LOG_ERR, "%s: not a version 2 LFS", fsname); 255 return -1; 256 } 257 258 /* Assume fsname is the mounted name */ 259 strncpy((char *)fs->lfs_fsmnt, fsname, MNAMELEN); 260 261 /* Set up vnodes for Ifile and raw device */ 262 fs->lfs_ivnode = fd_vget(fs->clfs_ifilefd, fs->lfs_bsize, 0, 0); 263 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 264 atatime); 265 266 /* Allocate and clear segtab */ 267 fs->clfs_segtab = (struct clfs_seguse *)malloc(fs->lfs_nseg * 268 sizeof(*fs->clfs_segtab)); 269 fs->clfs_segtabp = (struct clfs_seguse **)malloc(fs->lfs_nseg * 270 sizeof(*fs->clfs_segtabp)); 271 if (fs->clfs_segtab == NULL || fs->clfs_segtabp == NULL) { 272 syslog(LOG_ERR, "%s: couldn't malloc segment table: %m", 273 fs->clfs_dev); 274 return -1; 275 } 276 277 for (i = 0; i < fs->lfs_nseg; i++) { 278 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 279 fs->clfs_segtab[i].flags = 0x0; 280 } 281 282 syslog(LOG_NOTICE, "%s: attaching cleaner", fsname); 283 return 0; 284 } 285 286 /* 287 * Invalidate all the currently held Ifile blocks so they will be 288 * reread when we clean. Check the size while we're at it, and 289 * resize the buffer cache if necessary. 290 */ 291 void 292 reload_ifile(struct clfs *fs) 293 { 294 struct ubuf *bp; 295 struct stat st; 296 int ohashmax; 297 extern int hashmax; 298 299 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_dirtyblkhd)) != NULL) { 300 bremfree(bp); 301 buf_destroy(bp); 302 } 303 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_cleanblkhd)) != NULL) { 304 bremfree(bp); 305 buf_destroy(bp); 306 } 307 308 /* If Ifile is larger than buffer cache, rehash */ 309 fstat(fs->clfs_ifilefd, &st); 310 if (st.st_size / fs->lfs_bsize > hashmax) { 311 ohashmax = hashmax; 312 bufrehash(st.st_size / fs->lfs_bsize); 313 dlog("%s: resized buffer hash from %d to %d", 314 fs->lfs_fsmnt, ohashmax, hashmax); 315 } 316 } 317 318 /* 319 * Get IFILE entry for the given inode, store in ifpp. The buffer 320 * which contains that data is returned in bpp, and must be brelse()d 321 * by the caller. 322 */ 323 void 324 lfs_ientry(IFILE **ifpp, struct clfs *fs, ino_t ino, struct ubuf **bpp) 325 { 326 int error; 327 328 error = bread(fs->lfs_ivnode, ino / fs->lfs_ifpb + fs->lfs_cleansz + 329 fs->lfs_segtabsz, fs->lfs_bsize, NOCRED, 0, bpp); 330 if (error) 331 syslog(LOG_ERR, "%s: ientry failed for ino %d", 332 fs->lfs_fsmnt, (int)ino); 333 *ifpp = (IFILE *)(*bpp)->b_data + ino % fs->lfs_ifpb; 334 return; 335 } 336 337 #ifdef TEST_PATTERN 338 /* 339 * Check ROOTINO for file data. The assumption is that we are running 340 * the "twofiles" test with the rest of the filesystem empty. Files 341 * created by "twofiles" match the test pattern, but ROOTINO and the 342 * executable itself (assumed to be inode 3) should not match. 343 */ 344 static void 345 check_test_pattern(BLOCK_INFO *bip) 346 { 347 int j; 348 unsigned char *cp = bip->bi_bp; 349 350 /* Check inode sanity */ 351 if (bip->bi_lbn == LFS_UNUSED_LBN) { 352 assert(((struct ufs1_dinode *)bip->bi_bp)->di_inumber == 353 bip->bi_inode); 354 } 355 356 /* These can have the test pattern and it's all good */ 357 if (bip->bi_inode > 3) 358 return; 359 360 for (j = 0; j < bip->bi_size; j++) { 361 if (cp[j] != (j & 0xff)) 362 break; 363 } 364 assert(j < bip->bi_size); 365 } 366 #endif /* TEST_PATTERN */ 367 368 /* 369 * Parse the partial segment at daddr, adding its information to 370 * bip. Return the address of the next partial segment to read. 371 */ 372 int32_t 373 parse_pseg(struct clfs *fs, daddr_t daddr, BLOCK_INFO **bipp, int *bic) 374 { 375 SEGSUM *ssp; 376 IFILE *ifp; 377 BLOCK_INFO *bip, *nbip; 378 int32_t *iaddrp, idaddr, odaddr; 379 FINFO *fip; 380 struct ubuf *ifbp; 381 struct ufs1_dinode *dip; 382 u_int32_t ck, vers; 383 int fic, inoc, obic; 384 int i; 385 char *cp; 386 387 odaddr = daddr; 388 obic = *bic; 389 bip = *bipp; 390 391 /* 392 * Retrieve the segment header, set up the SEGSUM pointer 393 * as well as the first FINFO and inode address pointer. 394 */ 395 cp = fd_ptrget(fs->clfs_devvp, daddr); 396 ssp = (SEGSUM *)cp; 397 iaddrp = ((int32_t *)(cp + fs->lfs_ibsize)) - 1; 398 fip = (FINFO *)(cp + sizeof(SEGSUM)); 399 400 /* 401 * Check segment header magic and checksum 402 */ 403 if (ssp->ss_magic != SS_MAGIC) { 404 syslog(LOG_WARNING, "%s: sumsum magic number bad at 0x%x:" 405 " read 0x%x, expected 0x%x", fs->lfs_fsmnt, 406 (int32_t)daddr, ssp->ss_magic, SS_MAGIC); 407 return 0x0; 408 } 409 ck = cksum(&ssp->ss_datasum, fs->lfs_sumsize - sizeof(ssp->ss_sumsum)); 410 if (ck != ssp->ss_sumsum) { 411 syslog(LOG_WARNING, "%s: sumsum checksum mismatch at 0x%x:" 412 " read 0x%x, computed 0x%x", fs->lfs_fsmnt, 413 (int32_t)daddr, ssp->ss_sumsum, ck); 414 return 0x0; 415 } 416 417 /* Initialize data sum */ 418 ck = 0; 419 420 /* Point daddr at next block after segment summary */ 421 ++daddr; 422 423 /* 424 * Loop over file info and inode pointers. We always move daddr 425 * forward here because we are also computing the data checksum 426 * as we go. 427 */ 428 fic = inoc = 0; 429 while (fic < ssp->ss_nfinfo || inoc < ssp->ss_ninos) { 430 /* 431 * We must have either a file block or an inode block. 432 * If we don't have either one, it's an error. 433 */ 434 if (fic >= ssp->ss_nfinfo && *iaddrp != daddr) { 435 syslog(LOG_WARNING, "%s: bad pseg at %x (seg %d)", 436 fs->lfs_fsmnt, odaddr, dtosn(fs, odaddr)); 437 *bipp = bip; 438 return 0x0; 439 } 440 441 /* 442 * Note each inode from the inode blocks 443 */ 444 if (inoc < ssp->ss_ninos && *iaddrp == daddr) { 445 cp = fd_ptrget(fs->clfs_devvp, daddr); 446 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 447 dip = (struct ufs1_dinode *)cp; 448 for (i = 0; i < fs->lfs_inopb; i++) { 449 if (dip[i].di_inumber == 0) 450 break; 451 452 /* 453 * Check currency before adding it 454 */ 455 #ifndef REPAIR_ZERO_FINFO 456 lfs_ientry(&ifp, fs, dip[i].di_inumber, &ifbp); 457 idaddr = ifp->if_daddr; 458 brelse(ifbp, 0); 459 if (idaddr != daddr) 460 #endif 461 continue; 462 463 /* 464 * A current inode. Add it. 465 */ 466 ++*bic; 467 nbip = (BLOCK_INFO *)realloc(bip, *bic * 468 sizeof(*bip)); 469 if (nbip) 470 bip = nbip; 471 else { 472 --*bic; 473 *bipp = bip; 474 return 0x0; 475 } 476 bip[*bic - 1].bi_inode = dip[i].di_inumber; 477 bip[*bic - 1].bi_lbn = LFS_UNUSED_LBN; 478 bip[*bic - 1].bi_daddr = daddr; 479 bip[*bic - 1].bi_segcreate = ssp->ss_create; 480 bip[*bic - 1].bi_version = dip[i].di_gen; 481 bip[*bic - 1].bi_bp = &(dip[i]); 482 bip[*bic - 1].bi_size = DINODE1_SIZE; 483 } 484 inoc += i; 485 daddr += btofsb(fs, fs->lfs_ibsize); 486 --iaddrp; 487 continue; 488 } 489 490 /* 491 * Note each file block from the finfo blocks 492 */ 493 if (fic >= ssp->ss_nfinfo) 494 continue; 495 496 /* Count this finfo, whether or not we use it */ 497 ++fic; 498 499 /* 500 * If this finfo has nblocks==0, it was written wrong. 501 * Kernels with this problem always wrote this zero-sized 502 * finfo last, so just ignore it. 503 */ 504 if (fip->fi_nblocks == 0) { 505 #ifdef REPAIR_ZERO_FINFO 506 struct ubuf *nbp; 507 SEGSUM *nssp; 508 509 syslog(LOG_WARNING, "fixing short FINFO at %x (seg %d)", 510 odaddr, dtosn(fs, odaddr)); 511 bread(fs->clfs_devvp, odaddr, fs->lfs_fsize, 512 NOCRED, 0, &nbp); 513 nssp = (SEGSUM *)nbp->b_data; 514 --nssp->ss_nfinfo; 515 nssp->ss_sumsum = cksum(&nssp->ss_datasum, 516 fs->lfs_sumsize - sizeof(nssp->ss_sumsum)); 517 bwrite(nbp); 518 #endif 519 syslog(LOG_WARNING, "zero-length FINFO at %x (seg %d)", 520 odaddr, dtosn(fs, odaddr)); 521 continue; 522 } 523 524 /* 525 * Check currency before adding blocks 526 */ 527 #ifdef REPAIR_ZERO_FINFO 528 vers = -1; 529 #else 530 lfs_ientry(&ifp, fs, fip->fi_ino, &ifbp); 531 vers = ifp->if_version; 532 brelse(ifbp, 0); 533 #endif 534 if (vers != fip->fi_version) { 535 size_t size; 536 537 /* Read all the blocks from the data summary */ 538 for (i = 0; i < fip->fi_nblocks; i++) { 539 size = (i == fip->fi_nblocks - 1) ? 540 fip->fi_lastlength : fs->lfs_bsize; 541 cp = fd_ptrget(fs->clfs_devvp, daddr); 542 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 543 daddr += btofsb(fs, size); 544 } 545 fip = (FINFO *)(fip->fi_blocks + fip->fi_nblocks); 546 continue; 547 } 548 549 /* Add all the blocks from the finfos (current or not) */ 550 nbip = (BLOCK_INFO *)realloc(bip, (*bic + fip->fi_nblocks) * 551 sizeof(*bip)); 552 if (nbip) 553 bip = nbip; 554 else { 555 *bipp = bip; 556 return 0x0; 557 } 558 559 for (i = 0; i < fip->fi_nblocks; i++) { 560 bip[*bic + i].bi_inode = fip->fi_ino; 561 bip[*bic + i].bi_lbn = fip->fi_blocks[i]; 562 bip[*bic + i].bi_daddr = daddr; 563 bip[*bic + i].bi_segcreate = ssp->ss_create; 564 bip[*bic + i].bi_version = fip->fi_version; 565 bip[*bic + i].bi_size = (i == fip->fi_nblocks - 1) ? 566 fip->fi_lastlength : fs->lfs_bsize; 567 cp = fd_ptrget(fs->clfs_devvp, daddr); 568 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 569 bip[*bic + i].bi_bp = cp; 570 daddr += btofsb(fs, bip[*bic + i].bi_size); 571 572 #ifdef TEST_PATTERN 573 check_test_pattern(bip + *bic + i); /* XXXDEBUG */ 574 #endif 575 } 576 *bic += fip->fi_nblocks; 577 fip = (FINFO *)(fip->fi_blocks + fip->fi_nblocks); 578 } 579 580 #ifndef REPAIR_ZERO_FINFO 581 if (ssp->ss_datasum != ck) { 582 syslog(LOG_WARNING, "%s: data checksum bad at 0x%x:" 583 " read 0x%x, computed 0x%x", fs->lfs_fsmnt, odaddr, 584 ssp->ss_datasum, ck); 585 *bic = obic; 586 return 0x0; 587 } 588 #endif 589 590 *bipp = bip; 591 return daddr; 592 } 593 594 static void 595 log_segment_read(struct clfs *fs, int sn) 596 { 597 FILE *fp; 598 char *cp; 599 600 /* 601 * Write the segment read, and its contents, into a log file in 602 * the current directory. We don't need to log the location of 603 * the segment, since that can be inferred from the segments up 604 * to this point (ss_nextseg field of the previously written segment). 605 * 606 * We can use this info later to reconstruct the filesystem at any 607 * given point in time for analysis, by replaying the log forward 608 * indexed by the segment serial numbers; but it is not suitable 609 * for everyday use since the copylog will be simply enormous. 610 */ 611 cp = fd_ptrget(fs->clfs_devvp, sntod(fs, sn)); 612 613 fp = fopen(copylog_filename, "ab"); 614 if (fp != NULL) { 615 if (fwrite(cp, (size_t)fs->lfs_ssize, 1, fp) != 1) { 616 perror("writing segment to copy log"); 617 } 618 } 619 fclose(fp); 620 } 621 622 /* 623 * Read a segment to populate the BLOCK_INFO structures. 624 * Return the number of partial segments read and parsed. 625 */ 626 int 627 load_segment(struct clfs *fs, int sn, BLOCK_INFO **bipp, int *bic) 628 { 629 int32_t daddr; 630 int i, npseg; 631 632 daddr = sntod(fs, sn); 633 if (daddr < btofsb(fs, LFS_LABELPAD)) 634 daddr = btofsb(fs, LFS_LABELPAD); 635 for (i = 0; i < LFS_MAXNUMSB; i++) { 636 if (fs->lfs_sboffs[i] == daddr) { 637 daddr += btofsb(fs, LFS_SBPAD); 638 break; 639 } 640 } 641 642 /* Preload the segment buffer */ 643 if (fd_preload(fs->clfs_devvp, sntod(fs, sn)) < 0) 644 return -1; 645 646 if (copylog_filename) 647 log_segment_read(fs, sn); 648 649 /* Note bytes read for stats */ 650 cleaner_stats.segs_cleaned++; 651 cleaner_stats.bytes_read += fs->lfs_ssize; 652 ++fs->clfs_nactive; 653 654 npseg = 0; 655 while(dtosn(fs, daddr) == sn && 656 dtosn(fs, daddr + btofsb(fs, fs->lfs_bsize)) == sn) { 657 daddr = parse_pseg(fs, daddr, bipp, bic); 658 if (daddr == 0x0) { 659 ++cleaner_stats.segs_error; 660 break; 661 } 662 ++npseg; 663 } 664 665 return npseg; 666 } 667 668 void 669 calc_cb(struct clfs *fs, int sn, struct clfs_seguse *t) 670 { 671 time_t now; 672 int64_t age, benefit, cost; 673 674 time(&now); 675 age = (now < t->lastmod ? 0 : now - t->lastmod); 676 677 /* Under no circumstances clean active or already-clean segments */ 678 if ((t->flags & SEGUSE_ACTIVE) || !(t->flags & SEGUSE_DIRTY)) { 679 t->priority = 0; 680 return; 681 } 682 683 /* 684 * If the segment is empty, there is no reason to clean it. 685 * Clear its error condition, if any, since we are never going to 686 * try to parse this one. 687 */ 688 if (t->nbytes == 0) { 689 t->flags &= ~SEGUSE_ERROR; /* Strip error once empty */ 690 t->priority = 0; 691 return; 692 } 693 694 if (t->flags & SEGUSE_ERROR) { /* No good if not already empty */ 695 /* No benefit */ 696 t->priority = 0; 697 return; 698 } 699 700 if (t->nbytes > fs->lfs_ssize) { 701 /* Another type of error */ 702 syslog(LOG_WARNING, "segment %d: bad seguse count %d", 703 sn, t->nbytes); 704 t->flags |= SEGUSE_ERROR; 705 t->priority = 0; 706 return; 707 } 708 709 /* 710 * The non-degenerate case. Use Rosenblum's cost-benefit algorithm. 711 * Calculate the benefit from cleaning this segment (one segment, 712 * minus fragmentation, dirty blocks and a segment summary block) 713 * and weigh that against the cost (bytes read plus bytes written). 714 * We count the summary headers as "dirty" to avoid cleaning very 715 * old and very full segments. 716 */ 717 benefit = (int64_t)fs->lfs_ssize - t->nbytes - 718 (t->nsums + 1) * fs->lfs_fsize; 719 if (fs->lfs_bsize > fs->lfs_fsize) /* fragmentation */ 720 benefit -= (fs->lfs_bsize / 2); 721 if (benefit <= 0) { 722 t->priority = 0; 723 return; 724 } 725 726 cost = fs->lfs_ssize + t->nbytes; 727 t->priority = (256 * benefit * age) / cost; 728 729 return; 730 } 731 732 /* 733 * Comparator for BLOCK_INFO structures. Anything not in one of the segments 734 * we're looking at sorts higher; after that we sort first by inode number 735 * and then by block number (unsigned, i.e., negative sorts higher) *but* 736 * sort inodes before data blocks. 737 */ 738 static int 739 bi_comparator(const void *va, const void *vb) 740 { 741 const BLOCK_INFO *a, *b; 742 743 a = (const BLOCK_INFO *)va; 744 b = (const BLOCK_INFO *)vb; 745 746 /* Check for out-of-place block */ 747 if (a->bi_segcreate == a->bi_daddr && 748 b->bi_segcreate != b->bi_daddr) 749 return -1; 750 if (a->bi_segcreate != a->bi_daddr && 751 b->bi_segcreate == b->bi_daddr) 752 return 1; 753 if (a->bi_size <= 0 && b->bi_size > 0) 754 return 1; 755 if (b->bi_size <= 0 && a->bi_size > 0) 756 return -1; 757 758 /* Check inode number */ 759 if (a->bi_inode != b->bi_inode) 760 return a->bi_inode - b->bi_inode; 761 762 /* Check lbn */ 763 if (a->bi_lbn == LFS_UNUSED_LBN) /* Inodes sort lower than blocks */ 764 return -1; 765 if (b->bi_lbn == LFS_UNUSED_LBN) 766 return 1; 767 if ((u_int32_t)a->bi_lbn > (u_int32_t)b->bi_lbn) 768 return 1; 769 else 770 return -1; 771 772 return 0; 773 } 774 775 /* 776 * Comparator for sort_segments: cost-benefit equation. 777 */ 778 static int 779 cb_comparator(const void *va, const void *vb) 780 { 781 const struct clfs_seguse *a, *b; 782 783 a = *(const struct clfs_seguse * const *)va; 784 b = *(const struct clfs_seguse * const *)vb; 785 return a->priority > b->priority ? -1 : 1; 786 } 787 788 void 789 toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, int *bic, int *sizep) 790 { 791 int i, r; 792 BLOCK_INFO *bip = *bipp; 793 struct lfs_fcntl_markv /* { 794 BLOCK_INFO *blkiov; 795 int blkcnt; 796 } */ lim; 797 798 if (bic == 0 || bip == NULL) 799 return; 800 801 /* 802 * Kludge: Store the disk address in segcreate so we know which 803 * ones to toss. 804 */ 805 for (i = 0; i < *bic; i++) 806 bip[i].bi_segcreate = bip[i].bi_daddr; 807 808 /* Sort the blocks */ 809 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 810 811 /* Use bmapv to locate the blocks */ 812 lim.blkiov = bip; 813 lim.blkcnt = *bic; 814 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim)) < 0) { 815 syslog(LOG_WARNING, "%s: bmapv returned %d (%m)", 816 fs->lfs_fsmnt, r); 817 return; 818 } 819 820 /* Toss blocks not in this segment */ 821 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 822 823 /* Get rid of stale blocks */ 824 if (sizep) 825 *sizep = 0; 826 for (i = 0; i < *bic; i++) { 827 if (bip[i].bi_segcreate != bip[i].bi_daddr) 828 break; 829 if (sizep) 830 *sizep += bip[i].bi_size; 831 } 832 *bic = i; /* XXX realloc bip? */ 833 *bipp = bip; 834 835 return; 836 } 837 838 /* 839 * Clean a segment and mark it invalid. 840 */ 841 int 842 invalidate_segment(struct clfs *fs, int sn) 843 { 844 BLOCK_INFO *bip; 845 int i, r, bic; 846 off_t nb; 847 double util; 848 struct lfs_fcntl_markv /* { 849 BLOCK_INFO *blkiov; 850 int blkcnt; 851 } */ lim; 852 853 dlog("%s: inval seg %d", fs->lfs_fsmnt, sn); 854 855 bip = NULL; 856 bic = 0; 857 fs->clfs_nactive = 0; 858 if (load_segment(fs, sn, &bip, &bic) <= 0) 859 return -1; 860 toss_old_blocks(fs, &bip, &bic, NULL); 861 862 /* Record statistics */ 863 for (i = nb = 0; i < bic; i++) 864 nb += bip[i].bi_size; 865 util = ((double)nb) / (fs->clfs_nactive * fs->lfs_ssize); 866 cleaner_stats.util_tot += util; 867 cleaner_stats.util_sos += util * util; 868 cleaner_stats.bytes_written += nb; 869 870 /* 871 * Use markv to move the blocks. 872 */ 873 lim.blkiov = bip; 874 lim.blkcnt = bic; 875 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) { 876 syslog(LOG_WARNING, "%s: markv returned %d (%m) " 877 "for seg %d", fs->lfs_fsmnt, r, sn); 878 return r; 879 } 880 881 /* 882 * Finally call invalidate to invalidate the segment. 883 */ 884 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNINVAL, &sn)) < 0) { 885 syslog(LOG_WARNING, "%s: inval returned %d (%m) " 886 "for seg %d", fs->lfs_fsmnt, r, sn); 887 return r; 888 } 889 890 return 0; 891 } 892 893 /* 894 * Check to see if the given ino/lbn pair is represented in the BLOCK_INFO 895 * array we are sending to the kernel, or if the kernel will have to add it. 896 * The kernel will only add each such pair once, though, so keep track of 897 * previous requests in a separate "extra" BLOCK_INFO array. Returns 1 898 * if the block needs to be added, 0 if it is already represented. 899 */ 900 static int 901 check_or_add(ino_t ino, int32_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp) 902 { 903 BLOCK_INFO *t, *ebip = *ebipp; 904 int ebic = *ebicp; 905 int k; 906 907 for (k = 0; k < bic; k++) { 908 if (bip[k].bi_inode != ino) 909 break; 910 if (bip[k].bi_lbn == lbn) { 911 return 0; 912 } 913 } 914 915 /* Look on the list of extra blocks, too */ 916 for (k = 0; k < ebic; k++) { 917 if (ebip[k].bi_inode == ino && ebip[k].bi_lbn == lbn) { 918 return 0; 919 } 920 } 921 922 ++ebic; 923 t = realloc(ebip, ebic * sizeof(BLOCK_INFO)); 924 if (t == NULL) 925 return 1; /* Note *ebipc is not updated */ 926 927 ebip = t; 928 ebip[ebic - 1].bi_inode = ino; 929 ebip[ebic - 1].bi_lbn = lbn; 930 931 *ebipp = ebip; 932 *ebicp = ebic; 933 return 1; 934 } 935 936 /* 937 * Look for indirect blocks we will have to write which are not 938 * contained in this collection of blocks. This constitutes 939 * a hidden cleaning cost, since we are unaware of it until we 940 * have already read the segments. Return the total cost, and fill 941 * in *ifc with the part of that cost due to rewriting the Ifile. 942 */ 943 static off_t 944 check_hidden_cost(struct clfs *fs, BLOCK_INFO *bip, int bic, off_t *ifc) 945 { 946 int start; 947 struct indir in[NIADDR + 1]; 948 int num; 949 int i, j, ebic; 950 BLOCK_INFO *ebip; 951 int32_t lbn; 952 953 start = 0; 954 ebip = NULL; 955 ebic = 0; 956 for (i = 0; i < bic; i++) { 957 if (i == 0 || bip[i].bi_inode != bip[start].bi_inode) { 958 start = i; 959 /* 960 * Look for IFILE blocks, unless this is the Ifile. 961 */ 962 if (bip[i].bi_inode != fs->lfs_ifile) { 963 lbn = fs->lfs_cleansz + bip[i].bi_inode / 964 fs->lfs_ifpb; 965 *ifc += check_or_add(fs->lfs_ifile, lbn, 966 bip, bic, &ebip, &ebic); 967 } 968 } 969 if (bip[i].bi_lbn == LFS_UNUSED_LBN) 970 continue; 971 if (bip[i].bi_lbn < NDADDR) 972 continue; 973 974 ufs_getlbns((struct lfs *)fs, NULL, (daddr_t)bip[i].bi_lbn, in, &num); 975 for (j = 0; j < num; j++) { 976 check_or_add(bip[i].bi_inode, in[j].in_lbn, 977 bip + start, bic - start, &ebip, &ebic); 978 } 979 } 980 return ebic; 981 } 982 983 /* 984 * Select segments to clean, add blocks from these segments to a cleaning 985 * list, and send this list through lfs_markv() to move them to new 986 * locations on disk. 987 */ 988 int 989 clean_fs(struct clfs *fs, CLEANERINFO *cip) 990 { 991 int i, j, ngood, sn, bic, r, npos; 992 int bytes, totbytes; 993 struct ubuf *bp; 994 SEGUSE *sup; 995 static BLOCK_INFO *bip; 996 struct lfs_fcntl_markv /* { 997 BLOCK_INFO *blkiov; 998 int blkcnt; 999 } */ lim; 1000 int mc; 1001 BLOCK_INFO *mbip; 1002 int inc; 1003 off_t nb; 1004 off_t goal; 1005 off_t extra, if_extra; 1006 double util; 1007 1008 /* Read the segment table into our private structure */ 1009 npos = 0; 1010 for (i = 0; i < fs->lfs_nseg; i+= fs->lfs_sepb) { 1011 bread(fs->lfs_ivnode, fs->lfs_cleansz + i / fs->lfs_sepb, 1012 fs->lfs_bsize, NOCRED, 0, &bp); 1013 for (j = 0; j < fs->lfs_sepb && i + j < fs->lfs_nseg; j++) { 1014 sup = ((SEGUSE *)bp->b_data) + j; 1015 fs->clfs_segtab[i + j].nbytes = sup->su_nbytes; 1016 fs->clfs_segtab[i + j].nsums = sup->su_nsums; 1017 fs->clfs_segtab[i + j].lastmod = sup->su_lastmod; 1018 /* Keep error status but renew other flags */ 1019 fs->clfs_segtab[i + j].flags &= SEGUSE_ERROR; 1020 fs->clfs_segtab[i + j].flags |= sup->su_flags; 1021 1022 /* Compute cost-benefit coefficient */ 1023 calc_cb(fs, i + j, fs->clfs_segtab + i + j); 1024 if (fs->clfs_segtab[i + j].priority > 0) 1025 ++npos; 1026 } 1027 brelse(bp, 0); 1028 } 1029 1030 /* Sort segments based on cleanliness, fulness, and condition */ 1031 heapsort(fs->clfs_segtabp, fs->lfs_nseg, sizeof(struct clfs_seguse *), 1032 cb_comparator); 1033 1034 /* If no segment is cleanable, just return */ 1035 if (fs->clfs_segtabp[0]->priority == 0) { 1036 dlog("%s: no segment cleanable", fs->lfs_fsmnt); 1037 return 0; 1038 } 1039 1040 /* Load some segments' blocks into bip */ 1041 bic = 0; 1042 fs->clfs_nactive = 0; 1043 ngood = 0; 1044 if (use_bytes) { 1045 /* Set attainable goal */ 1046 goal = fs->lfs_ssize * atatime; 1047 if (goal > (cip->clean - 1) * fs->lfs_ssize / 2) 1048 goal = MAX((cip->clean - 1) * fs->lfs_ssize, 1049 fs->lfs_ssize) / 2; 1050 1051 dlog("%s: cleaning with goal %" PRId64 1052 " bytes (%d segs clean, %d cleanable)", 1053 fs->lfs_fsmnt, goal, cip->clean, npos); 1054 syslog(LOG_INFO, "%s: cleaning with goal %" PRId64 1055 " bytes (%d segs clean, %d cleanable)", 1056 fs->lfs_fsmnt, goal, cip->clean, npos); 1057 totbytes = 0; 1058 for (i = 0; i < fs->lfs_nseg && totbytes < goal; i++) { 1059 if (fs->clfs_segtabp[i]->priority == 0) 1060 break; 1061 /* Upper bound on number of segments at once */ 1062 if (ngood * fs->lfs_ssize > 4 * goal) 1063 break; 1064 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1065 dlog("%s: add seg %d prio %" PRIu64 1066 " containing %ld bytes", 1067 fs->lfs_fsmnt, sn, fs->clfs_segtabp[i]->priority, 1068 fs->clfs_segtabp[i]->nbytes); 1069 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) { 1070 ++ngood; 1071 toss_old_blocks(fs, &bip, &bic, &bytes); 1072 totbytes += bytes; 1073 } else if (r == 0) 1074 fd_release(fs->clfs_devvp); 1075 else 1076 break; 1077 } 1078 } else { 1079 /* Set attainable goal */ 1080 goal = atatime; 1081 if (goal > cip->clean - 1) 1082 goal = MAX(cip->clean - 1, 1); 1083 1084 dlog("%s: cleaning with goal %d segments (%d clean, %d cleanable)", 1085 fs->lfs_fsmnt, (int)goal, cip->clean, npos); 1086 for (i = 0; i < fs->lfs_nseg && ngood < goal; i++) { 1087 if (fs->clfs_segtabp[i]->priority == 0) 1088 break; 1089 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1090 dlog("%s: add seg %d prio %" PRIu64, 1091 fs->lfs_fsmnt, sn, fs->clfs_segtabp[i]->priority); 1092 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) 1093 ++ngood; 1094 else if (r == 0) 1095 fd_release(fs->clfs_devvp); 1096 else 1097 break; 1098 } 1099 toss_old_blocks(fs, &bip, &bic, NULL); 1100 } 1101 1102 /* If there is nothing to do, try again later. */ 1103 if (bic == 0) { 1104 dlog("%s: no blocks to clean in %d cleanable segments", 1105 fs->lfs_fsmnt, (int)ngood); 1106 fd_release_all(fs->clfs_devvp); 1107 return 0; 1108 } 1109 1110 /* Record statistics */ 1111 for (i = nb = 0; i < bic; i++) 1112 nb += bip[i].bi_size; 1113 util = ((double)nb) / (fs->clfs_nactive * fs->lfs_ssize); 1114 cleaner_stats.util_tot += util; 1115 cleaner_stats.util_sos += util * util; 1116 cleaner_stats.bytes_written += nb; 1117 1118 /* 1119 * Check out our blocks to see if there are hidden cleaning costs. 1120 * If there are, we might be cleaning ourselves deeper into a hole 1121 * rather than doing anything useful. 1122 * XXX do something about this. 1123 */ 1124 if_extra = 0; 1125 extra = fs->lfs_bsize * (off_t)check_hidden_cost(fs, bip, bic, &if_extra); 1126 if_extra *= fs->lfs_bsize; 1127 1128 /* 1129 * Use markv to move the blocks. 1130 */ 1131 if (do_small) 1132 inc = MAXPHYS / fs->lfs_bsize - 1; 1133 else 1134 inc = LFS_MARKV_MAXBLKCNT / 2; 1135 for (mc = 0, mbip = bip; mc < bic; mc += inc, mbip += inc) { 1136 lim.blkiov = mbip; 1137 lim.blkcnt = (bic - mc > inc ? inc : bic - mc); 1138 #ifdef TEST_PATTERN 1139 dlog("checking blocks %d-%d", mc, mc + lim.blkcnt - 1); 1140 for (i = 0; i < lim.blkcnt; i++) { 1141 check_test_pattern(mbip + i); 1142 } 1143 #endif /* TEST_PATTERN */ 1144 dlog("sending blocks %d-%d", mc, mc + lim.blkcnt - 1); 1145 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim))<0) { 1146 syslog(LOG_WARNING, "%s: markv returned %d (%m)", 1147 fs->lfs_fsmnt, r); 1148 if (errno != EAGAIN && errno != ESHUTDOWN) { 1149 fd_release_all(fs->clfs_devvp); 1150 return r; 1151 } 1152 } 1153 } 1154 1155 /* 1156 * Report progress (or lack thereof) 1157 */ 1158 syslog(LOG_INFO, "%s: wrote %" PRId64 " dirty + %" 1159 PRId64 " supporting indirect + %" 1160 PRId64 " supporting Ifile = %" 1161 PRId64 " bytes to clean %d segs (%" PRId64 "%% recovery)", 1162 fs->lfs_fsmnt, (int64_t)nb, (int64_t)(extra - if_extra), 1163 (int64_t)if_extra, (int64_t)(nb + extra), ngood, 1164 (ngood ? (int64_t)(100 - (100 * (nb + extra)) / 1165 (ngood * fs->lfs_ssize)) : 1166 (int64_t)0)); 1167 if (nb + extra >= ngood * fs->lfs_ssize) 1168 syslog(LOG_WARNING, "%s: cleaner not making forward progress", 1169 fs->lfs_fsmnt); 1170 1171 /* 1172 * Finally call reclaim to prompt cleaning of the segments. 1173 */ 1174 kops.ko_fcntl(fs->clfs_ifilefd, LFCNRECLAIM, NULL); 1175 1176 fd_release_all(fs->clfs_devvp); 1177 return 0; 1178 } 1179 1180 /* 1181 * Read the cleanerinfo block and apply cleaning policy to determine whether 1182 * the given filesystem needs to be cleaned. Returns 1 if it does, 0 if it 1183 * does not, or -1 on error. 1184 */ 1185 int 1186 needs_cleaning(struct clfs *fs, CLEANERINFO *cip) 1187 { 1188 struct ubuf *bp; 1189 struct stat st; 1190 daddr_t fsb_per_seg, max_free_segs; 1191 time_t now; 1192 double loadavg; 1193 1194 /* If this fs is "on hold", don't clean it. */ 1195 if (fs->clfs_onhold) 1196 return 0; 1197 1198 /* 1199 * Read the cleanerinfo block from the Ifile. We don't want 1200 * the cached information, so invalidate the buffer before 1201 * handing it back. 1202 */ 1203 if (bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp)) { 1204 syslog(LOG_ERR, "%s: can't read inode", fs->lfs_fsmnt); 1205 return -1; 1206 } 1207 *cip = *(CLEANERINFO *)bp->b_data; /* Structure copy */ 1208 brelse(bp, B_INVAL); 1209 cleaner_stats.bytes_read += fs->lfs_bsize; 1210 1211 /* 1212 * If the number of segments changed under us, reinit. 1213 * We don't have to start over from scratch, however, 1214 * since we don't hold any buffers. 1215 */ 1216 if (fs->lfs_nseg != cip->clean + cip->dirty) { 1217 if (reinit_fs(fs) < 0) { 1218 /* The normal case for unmount */ 1219 syslog(LOG_NOTICE, "%s: filesystem unmounted", fs->lfs_fsmnt); 1220 return -1; 1221 } 1222 syslog(LOG_NOTICE, "%s: nsegs changed", fs->lfs_fsmnt); 1223 } 1224 1225 /* Compute theoretical "free segments" maximum based on usage */ 1226 fsb_per_seg = segtod(fs, 1); 1227 max_free_segs = MAX(cip->bfree, 0) / fsb_per_seg + fs->lfs_minfreeseg; 1228 1229 dlog("%s: bfree = %d, avail = %d, clean = %d/%d", 1230 fs->lfs_fsmnt, cip->bfree, cip->avail, cip->clean, fs->lfs_nseg); 1231 1232 /* If the writer is waiting on us, clean it */ 1233 if (cip->clean <= fs->lfs_minfreeseg || 1234 (cip->flags & LFS_CLEANER_MUST_CLEAN)) 1235 return 1; 1236 1237 /* If there are enough segments, don't clean it */ 1238 if (cip->bfree - cip->avail <= fsb_per_seg && 1239 cip->avail > fsb_per_seg) 1240 return 0; 1241 1242 /* If we are in dire straits, clean it */ 1243 if (cip->bfree - cip->avail > fsb_per_seg && 1244 cip->avail <= fsb_per_seg) 1245 return 1; 1246 1247 /* If under busy threshold, clean regardless of load */ 1248 if (cip->clean < max_free_segs * BUSY_LIM) 1249 return 1; 1250 1251 /* Check busy status; clean if idle and under idle limit */ 1252 if (use_fs_idle) { 1253 /* Filesystem idle */ 1254 time(&now); 1255 if (fstat(fs->clfs_ifilefd, &st) < 0) { 1256 syslog(LOG_ERR, "%s: failed to stat ifile", 1257 fs->lfs_fsmnt); 1258 return -1; 1259 } 1260 if (now - st.st_mtime > segwait_timeout && 1261 cip->clean < max_free_segs * IDLE_LIM) 1262 return 1; 1263 } else { 1264 /* CPU idle - use one-minute load avg */ 1265 if (getloadavg(&loadavg, 1) == -1) { 1266 syslog(LOG_ERR, "%s: failed to get load avg", 1267 fs->lfs_fsmnt); 1268 return -1; 1269 } 1270 if (loadavg < load_threshold && 1271 cip->clean < max_free_segs * IDLE_LIM) 1272 return 1; 1273 } 1274 1275 return 0; 1276 } 1277 1278 /* 1279 * Report statistics. If the signal was SIGUSR2, clear the statistics too. 1280 * If the signal was SIGINT, exit. 1281 */ 1282 static void 1283 sig_report(int sig) 1284 { 1285 double avg = 0.0, stddev; 1286 1287 avg = cleaner_stats.util_tot / MAX(cleaner_stats.segs_cleaned, 1.0); 1288 stddev = cleaner_stats.util_sos / MAX(cleaner_stats.segs_cleaned - 1289 avg * avg, 1.0); 1290 syslog(LOG_INFO, "bytes read: %" PRId64, cleaner_stats.bytes_read); 1291 syslog(LOG_INFO, "bytes written: %" PRId64, cleaner_stats.bytes_written); 1292 syslog(LOG_INFO, "segments cleaned: %" PRId64, cleaner_stats.segs_cleaned); 1293 #if 0 1294 /* "Empty segments" is meaningless, since the kernel handles those */ 1295 syslog(LOG_INFO, "empty segments: %" PRId64, cleaner_stats.segs_empty); 1296 #endif 1297 syslog(LOG_INFO, "error segments: %" PRId64, cleaner_stats.segs_error); 1298 syslog(LOG_INFO, "utilization total: %g", cleaner_stats.util_tot); 1299 syslog(LOG_INFO, "utilization sos: %g", cleaner_stats.util_sos); 1300 syslog(LOG_INFO, "utilization avg: %4.2f", avg); 1301 syslog(LOG_INFO, "utilization sdev: %9.6f", stddev); 1302 1303 if (debug) 1304 bufstats(); 1305 1306 if (sig == SIGUSR2) 1307 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1308 if (sig == SIGINT) 1309 exit(0); 1310 } 1311 1312 static void 1313 sig_exit(int sig) 1314 { 1315 exit(0); 1316 } 1317 1318 static void 1319 usage(void) 1320 { 1321 errx(1, "usage: lfs_cleanerd [-bcdfmqs] [-i segnum] [-l load] " 1322 "[-n nsegs] [-r report_freq] [-t timeout] fs_name ..."); 1323 } 1324 1325 #ifndef LFS_CLEANER_AS_LIB 1326 /* 1327 * Main. 1328 */ 1329 int 1330 main(int argc, char **argv) 1331 { 1332 1333 return lfs_cleaner_main(argc, argv); 1334 } 1335 #endif 1336 1337 int 1338 lfs_cleaner_main(int argc, char **argv) 1339 { 1340 int i, opt, error, r, loopcount, nodetach; 1341 struct timeval tv; 1342 CLEANERINFO ci; 1343 #ifndef USE_CLIENT_SERVER 1344 char *cp, *pidname; 1345 #endif 1346 1347 /* 1348 * Set up defaults 1349 */ 1350 atatime = 1; 1351 segwait_timeout = 300; /* Five minutes */ 1352 load_threshold = 0.2; 1353 stat_report = 0; 1354 inval_segment = -1; 1355 copylog_filename = NULL; 1356 nodetach = 0; 1357 1358 /* 1359 * Parse command-line arguments 1360 */ 1361 while ((opt = getopt(argc, argv, "bC:cdDfi:l:mn:qr:st:")) != -1) { 1362 switch (opt) { 1363 case 'b': /* Use bytes written, not segments read */ 1364 use_bytes = 1; 1365 break; 1366 case 'C': /* copy log */ 1367 copylog_filename = optarg; 1368 break; 1369 case 'c': /* Coalesce files */ 1370 do_coalesce++; 1371 break; 1372 case 'd': /* Debug mode. */ 1373 nodetach++; 1374 debug++; 1375 break; 1376 case 'D': /* stay-on-foreground */ 1377 nodetach++; 1378 break; 1379 case 'f': /* Use fs idle time rather than cpu idle */ 1380 use_fs_idle = 1; 1381 break; 1382 case 'i': /* Invalidate this segment */ 1383 inval_segment = atoi(optarg); 1384 break; 1385 case 'l': /* Load below which to clean */ 1386 load_threshold = atof(optarg); 1387 break; 1388 case 'm': /* [compat only] */ 1389 break; 1390 case 'n': /* How many segs to clean at once */ 1391 atatime = atoi(optarg); 1392 break; 1393 case 'q': /* Quit after one run */ 1394 do_quit = 1; 1395 break; 1396 case 'r': /* Report every stat_report segments */ 1397 stat_report = atoi(optarg); 1398 break; 1399 case 's': /* Small writes */ 1400 do_small = 1; 1401 break; 1402 case 't': /* timeout */ 1403 segwait_timeout = atoi(optarg); 1404 break; 1405 default: 1406 usage(); 1407 /* NOTREACHED */ 1408 } 1409 } 1410 argc -= optind; 1411 argv += optind; 1412 1413 if (argc < 1) 1414 usage(); 1415 if (inval_segment >= 0 && argc != 1) { 1416 errx(1, "lfs_cleanerd: may only specify one filesystem when " 1417 "using -i flag"); 1418 } 1419 1420 if (do_coalesce) { 1421 errx(1, "lfs_cleanerd: -c disabled due to reports of file " 1422 "corruption; you may re-enable it by rebuilding the " 1423 "cleaner"); 1424 } 1425 1426 /* 1427 * Set up daemon mode or foreground mode 1428 */ 1429 if (nodetach) { 1430 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID | LOG_PERROR, 1431 LOG_DAEMON); 1432 signal(SIGINT, sig_report); 1433 } else { 1434 if (daemon(0, 0) == -1) 1435 err(1, "lfs_cleanerd: couldn't become a daemon!"); 1436 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID, LOG_DAEMON); 1437 signal(SIGINT, sig_exit); 1438 } 1439 1440 /* 1441 * Look for an already-running master daemon. If there is one, 1442 * send it our filesystems to add to its list and exit. 1443 * If there is none, become the master. 1444 */ 1445 #ifdef USE_CLIENT_SERVER 1446 try_to_become_master(argc, argv); 1447 #else 1448 /* XXX think about this */ 1449 asprintf(&pidname, "lfs_cleanerd:m:%s", argv[0]); 1450 if (pidname == NULL) { 1451 syslog(LOG_ERR, "malloc failed: %m"); 1452 exit(1); 1453 } 1454 for (cp = pidname; cp != NULL; cp = strchr(cp, '/')) 1455 *cp = '|'; 1456 pidfile(pidname); 1457 #endif 1458 1459 /* 1460 * Signals mean daemon should report its statistics 1461 */ 1462 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1463 signal(SIGUSR1, sig_report); 1464 signal(SIGUSR2, sig_report); 1465 1466 /* 1467 * Start up buffer cache. We only use this for the Ifile, 1468 * and we will resize it if necessary, so it can start small. 1469 */ 1470 bufinit(4); 1471 1472 #ifdef REPAIR_ZERO_FINFO 1473 { 1474 BLOCK_INFO *bip = NULL; 1475 int bic = 0; 1476 1477 nfss = 1; 1478 fsp = (struct clfs **)malloc(sizeof(*fsp)); 1479 fsp[0] = (struct clfs *)calloc(1, sizeof(**fsp)); 1480 1481 if (init_unmounted_fs(fsp[0], argv[0]) < 0) { 1482 err(1, "init_unmounted_fs"); 1483 } 1484 dlog("Filesystem has %d segments", fsp[0]->lfs_nseg); 1485 for (i = 0; i < fsp[0]->lfs_nseg; i++) { 1486 load_segment(fsp[0], i, &bip, &bic); 1487 bic = 0; 1488 } 1489 exit(0); 1490 } 1491 #endif 1492 1493 /* 1494 * Initialize cleaning structures, open devices, etc. 1495 */ 1496 nfss = argc; 1497 fsp = (struct clfs **)malloc(nfss * sizeof(*fsp)); 1498 if (fsp == NULL) { 1499 syslog(LOG_ERR, "couldn't allocate fs table: %m"); 1500 exit(1); 1501 } 1502 for (i = 0; i < nfss; i++) { 1503 fsp[i] = (struct clfs *)calloc(1, sizeof(**fsp)); 1504 if ((r = init_fs(fsp[i], argv[i])) < 0) { 1505 syslog(LOG_ERR, "%s: couldn't init: error code %d", 1506 argv[i], r); 1507 handle_error(fsp, i); 1508 --i; /* Do the new #i over again */ 1509 } 1510 } 1511 1512 /* 1513 * If asked to coalesce, do so and exit. 1514 */ 1515 if (do_coalesce) { 1516 for (i = 0; i < nfss; i++) 1517 clean_all_inodes(fsp[i]); 1518 exit(0); 1519 } 1520 1521 /* 1522 * If asked to invalidate a segment, do that and exit. 1523 */ 1524 if (inval_segment >= 0) { 1525 invalidate_segment(fsp[0], inval_segment); 1526 exit(0); 1527 } 1528 1529 /* 1530 * Main cleaning loop. 1531 */ 1532 loopcount = 0; 1533 while (nfss > 0) { 1534 int cleaned_one; 1535 do { 1536 #ifdef USE_CLIENT_SERVER 1537 check_control_socket(); 1538 #endif 1539 cleaned_one = 0; 1540 for (i = 0; i < nfss; i++) { 1541 if ((error = needs_cleaning(fsp[i], &ci)) < 0) { 1542 handle_error(fsp, i); 1543 continue; 1544 } 1545 if (error == 0) /* No need to clean */ 1546 continue; 1547 1548 reload_ifile(fsp[i]); 1549 if (clean_fs(fsp[i], &ci) < 0) { 1550 handle_error(fsp, i); 1551 continue; 1552 } 1553 ++cleaned_one; 1554 } 1555 ++loopcount; 1556 if (stat_report && loopcount % stat_report == 0) 1557 sig_report(0); 1558 if (do_quit) 1559 exit(0); 1560 } while(cleaned_one); 1561 tv.tv_sec = segwait_timeout; 1562 tv.tv_usec = 0; 1563 /* XXX: why couldn't others work if fsp socket is shutdown? */ 1564 error = kops.ko_fcntl(fsp[0]->clfs_ifilefd,LFCNSEGWAITALL,&tv); 1565 if (error) { 1566 if (errno == ESHUTDOWN) { 1567 for (i = 0; i < nfss; i++) { 1568 handle_error(fsp, i); 1569 assert(nfss == 0); 1570 } 1571 } else 1572 err(1, "LFCNSEGWAITALL"); 1573 } 1574 } 1575 1576 /* NOTREACHED */ 1577 return 0; 1578 } 1579