1 /* $NetBSD: lfs_cleanerd.c,v 1.17 2009/03/16 00:08:10 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * The cleaner daemon for the NetBSD Log-structured File System. 34 * Only tested for use with version 2 LFSs. 35 */ 36 37 #include <sys/syslog.h> 38 #include <sys/param.h> 39 #include <sys/mount.h> 40 #include <sys/stat.h> 41 #include <ufs/ufs/inode.h> 42 #include <ufs/lfs/lfs.h> 43 44 #include <assert.h> 45 #include <err.h> 46 #include <errno.h> 47 #include <fcntl.h> 48 #include <stdio.h> 49 #include <stdlib.h> 50 #include <string.h> 51 #include <unistd.h> 52 #include <time.h> 53 #include <util.h> 54 55 #include "bufcache.h" 56 #include "vnode.h" 57 #include "lfs_user.h" 58 #include "fdfs.h" 59 #include "cleaner.h" 60 61 /* 62 * Global variables. 63 */ 64 /* XXX these top few should really be fs-specific */ 65 int use_fs_idle; /* Use fs idle rather than cpu idle time */ 66 int use_bytes; /* Use bytes written rather than segments cleaned */ 67 int load_threshold; /* How idle is idle (CPU idle) */ 68 int atatime; /* How many segments (bytes) to clean at a time */ 69 70 int nfss; /* Number of filesystems monitored by this cleanerd */ 71 struct clfs **fsp; /* Array of extended filesystem structures */ 72 int segwait_timeout; /* Time to wait in lfs_segwait() */ 73 int do_quit; /* Quit after one cleaning loop */ 74 int do_coalesce; /* Coalesce filesystem */ 75 int do_small; /* Use small writes through markv */ 76 char *copylog_filename; /* File to use for fs debugging analysis */ 77 int inval_segment; /* Segment to invalidate */ 78 int stat_report; /* Report statistics for this period of cycles */ 79 int debug; /* Turn on debugging */ 80 struct cleaner_stats { 81 double util_tot; 82 double util_sos; 83 off_t bytes_read; 84 off_t bytes_written; 85 off_t segs_cleaned; 86 off_t segs_empty; 87 off_t segs_error; 88 } cleaner_stats; 89 90 extern u_int32_t cksum(void *, size_t); 91 extern u_int32_t lfs_sb_cksum(struct dlfs *); 92 extern u_int32_t lfs_cksum_part(void *, size_t, u_int32_t); 93 extern int ufs_getlbns(struct lfs *, struct uvnode *, daddr_t, struct indir *, int *); 94 95 /* Compat */ 96 void pwarn(const char *unused, ...) { /* Does nothing */ }; 97 98 /* 99 * Log a message if debugging is turned on. 100 */ 101 void 102 dlog(const char *fmt, ...) 103 { 104 va_list ap; 105 106 if (debug == 0) 107 return; 108 109 va_start(ap, fmt); 110 vsyslog(LOG_DEBUG, fmt, ap); 111 va_end(ap); 112 } 113 114 /* 115 * Remove the specified filesystem from the list, due to its having 116 * become unmounted or other error condition. 117 */ 118 void 119 handle_error(struct clfs **cfsp, int n) 120 { 121 syslog(LOG_NOTICE, "%s: detaching cleaner", cfsp[n]->lfs_fsmnt); 122 free(cfsp[n]); 123 if (n != nfss - 1) 124 cfsp[n] = cfsp[nfss - 1]; 125 --nfss; 126 } 127 128 /* 129 * Reinitialize a filesystem if, e.g., its size changed. 130 */ 131 int 132 reinit_fs(struct clfs *fs) 133 { 134 char fsname[MNAMELEN]; 135 136 strncpy(fsname, (char *)fs->lfs_fsmnt, MNAMELEN); 137 close(fs->clfs_ifilefd); 138 close(fs->clfs_devfd); 139 fd_reclaim(fs->clfs_devvp); 140 fd_reclaim(fs->lfs_ivnode); 141 free(fs->clfs_dev); 142 free(fs->clfs_segtab); 143 free(fs->clfs_segtabp); 144 145 return init_fs(fs, fsname); 146 } 147 148 #ifdef REPAIR_ZERO_FINFO 149 /* 150 * Use fsck's lfs routines to load the Ifile from an unmounted fs. 151 * We interpret "fsname" as the name of the raw disk device. 152 */ 153 int 154 init_unmounted_fs(struct clfs *fs, char *fsname) 155 { 156 struct lfs *disc_fs; 157 int i; 158 159 fs->clfs_dev = fsname; 160 if ((fs->clfs_devfd = open(fs->clfs_dev, O_RDWR)) < 0) { 161 syslog(LOG_ERR, "couldn't open device %s read/write", 162 fs->clfs_dev); 163 return -1; 164 } 165 166 disc_fs = lfs_init(fs->clfs_devfd, 0, 0, 0, 0); 167 168 fs->lfs_dlfs = disc_fs->lfs_dlfs; /* Structure copy */ 169 strncpy(fs->lfs_fsmnt, fsname, MNAMELEN); 170 fs->lfs_ivnode = (struct uvnode *)disc_fs->lfs_ivnode; 171 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 172 atatime); 173 174 /* Allocate and clear segtab */ 175 fs->clfs_segtab = (struct clfs_seguse *)malloc(fs->lfs_nseg * 176 sizeof(*fs->clfs_segtab)); 177 fs->clfs_segtabp = (struct clfs_seguse **)malloc(fs->lfs_nseg * 178 sizeof(*fs->clfs_segtabp)); 179 for (i = 0; i < fs->lfs_nseg; i++) { 180 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 181 fs->clfs_segtab[i].flags = 0x0; 182 } 183 syslog(LOG_NOTICE, "%s: unmounted cleaner starting", fsname); 184 185 return 0; 186 } 187 #endif 188 189 /* 190 * Set up the file descriptors, including the Ifile descriptor. 191 * If we can't get the Ifile, this is not an LFS (or the kernel is 192 * too old to support the fcntl). 193 * XXX Merge this and init_unmounted_fs, switching on whether 194 * XXX "fsname" is a dir or a char special device. Should 195 * XXX also be able to read unmounted devices out of fstab, the way 196 * XXX fsck does. 197 */ 198 int 199 init_fs(struct clfs *fs, char *fsname) 200 { 201 struct statvfs sf; 202 int rootfd; 203 int i; 204 205 /* 206 * Get the raw device from the block device. 207 * XXX this is ugly. Is there a way to discover the raw device 208 * XXX for a given mount point? 209 */ 210 if (statvfs(fsname, &sf) < 0) 211 return -1; 212 fs->clfs_dev = malloc(strlen(sf.f_mntfromname) + 2); 213 if (fs->clfs_dev == NULL) { 214 syslog(LOG_ERR, "couldn't malloc device name string: %m"); 215 return -1; 216 } 217 sprintf(fs->clfs_dev, "/dev/r%s", sf.f_mntfromname + 5); 218 if ((fs->clfs_devfd = open(fs->clfs_dev, O_RDONLY)) < 0) { 219 syslog(LOG_ERR, "couldn't open device %s for reading", 220 fs->clfs_dev); 221 return -1; 222 } 223 224 /* Find the Ifile and open it */ 225 if ((rootfd = open(fsname, O_RDONLY)) < 0) 226 return -2; 227 if (fcntl(rootfd, LFCNIFILEFH, &fs->clfs_ifilefh) < 0) 228 return -3; 229 if ((fs->clfs_ifilefd = fhopen(&fs->clfs_ifilefh, 230 sizeof(fs->clfs_ifilefh), O_RDONLY)) < 0) 231 return -4; 232 close(rootfd); 233 234 /* Load in the superblock */ 235 if (pread(fs->clfs_devfd, &(fs->lfs_dlfs), sizeof(struct dlfs), 236 LFS_LABELPAD) < 0) 237 return -1; 238 239 /* If this is not a version 2 filesystem, complain and exit */ 240 if (fs->lfs_version != 2) { 241 syslog(LOG_ERR, "%s: not a version 2 LFS", fsname); 242 return -1; 243 } 244 245 /* Assume fsname is the mounted name */ 246 strncpy((char *)fs->lfs_fsmnt, fsname, MNAMELEN); 247 248 /* Set up vnodes for Ifile and raw device */ 249 fs->lfs_ivnode = fd_vget(fs->clfs_ifilefd, fs->lfs_bsize, 0, 0); 250 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 251 atatime); 252 253 /* Allocate and clear segtab */ 254 fs->clfs_segtab = (struct clfs_seguse *)malloc(fs->lfs_nseg * 255 sizeof(*fs->clfs_segtab)); 256 fs->clfs_segtabp = (struct clfs_seguse **)malloc(fs->lfs_nseg * 257 sizeof(*fs->clfs_segtabp)); 258 if (fs->clfs_segtab == NULL || fs->clfs_segtabp == NULL) { 259 syslog(LOG_ERR, "%s: couldn't malloc segment table: %m", 260 fs->clfs_dev); 261 return -1; 262 } 263 264 for (i = 0; i < fs->lfs_nseg; i++) { 265 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 266 fs->clfs_segtab[i].flags = 0x0; 267 } 268 269 syslog(LOG_NOTICE, "%s: attaching cleaner", fsname); 270 return 0; 271 } 272 273 /* 274 * Invalidate all the currently held Ifile blocks so they will be 275 * reread when we clean. Check the size while we're at it, and 276 * resize the buffer cache if necessary. 277 */ 278 void 279 reload_ifile(struct clfs *fs) 280 { 281 struct ubuf *bp; 282 struct stat st; 283 int ohashmax; 284 extern int hashmax; 285 286 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_dirtyblkhd)) != NULL) { 287 bremfree(bp); 288 buf_destroy(bp); 289 } 290 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_cleanblkhd)) != NULL) { 291 bremfree(bp); 292 buf_destroy(bp); 293 } 294 295 /* If Ifile is larger than buffer cache, rehash */ 296 fstat(fs->clfs_ifilefd, &st); 297 if (st.st_size / fs->lfs_bsize > hashmax) { 298 ohashmax = hashmax; 299 bufrehash(st.st_size / fs->lfs_bsize); 300 dlog("%s: resized buffer hash from %d to %d", 301 fs->lfs_fsmnt, ohashmax, hashmax); 302 } 303 } 304 305 /* 306 * Get IFILE entry for the given inode, store in ifpp. The buffer 307 * which contains that data is returned in bpp, and must be brelse()d 308 * by the caller. 309 */ 310 void 311 lfs_ientry(IFILE **ifpp, struct clfs *fs, ino_t ino, struct ubuf **bpp) 312 { 313 int error; 314 315 error = bread(fs->lfs_ivnode, ino / fs->lfs_ifpb + fs->lfs_cleansz + 316 fs->lfs_segtabsz, fs->lfs_bsize, NOCRED, 0, bpp); 317 if (error) 318 syslog(LOG_ERR, "%s: ientry failed for ino %d", 319 fs->lfs_fsmnt, (int)ino); 320 *ifpp = (IFILE *)(*bpp)->b_data + ino % fs->lfs_ifpb; 321 return; 322 } 323 324 #ifdef TEST_PATTERN 325 /* 326 * Check ROOTINO for file data. The assumption is that we are running 327 * the "twofiles" test with the rest of the filesystem empty. Files 328 * created by "twofiles" match the test pattern, but ROOTINO and the 329 * executable itself (assumed to be inode 3) should not match. 330 */ 331 static void 332 check_test_pattern(BLOCK_INFO *bip) 333 { 334 int j; 335 unsigned char *cp = bip->bi_bp; 336 337 /* Check inode sanity */ 338 if (bip->bi_lbn == LFS_UNUSED_LBN) { 339 assert(((struct ufs1_dinode *)bip->bi_bp)->di_inumber == 340 bip->bi_inode); 341 } 342 343 /* These can have the test pattern and it's all good */ 344 if (bip->bi_inode > 3) 345 return; 346 347 for (j = 0; j < bip->bi_size; j++) { 348 if (cp[j] != (j & 0xff)) 349 break; 350 } 351 assert(j < bip->bi_size); 352 } 353 #endif /* TEST_PATTERN */ 354 355 /* 356 * Parse the partial segment at daddr, adding its information to 357 * bip. Return the address of the next partial segment to read. 358 */ 359 int32_t 360 parse_pseg(struct clfs *fs, daddr_t daddr, BLOCK_INFO **bipp, int *bic) 361 { 362 SEGSUM *ssp; 363 IFILE *ifp; 364 BLOCK_INFO *bip, *nbip; 365 int32_t *iaddrp, idaddr, odaddr; 366 FINFO *fip; 367 struct ubuf *ifbp; 368 struct ufs1_dinode *dip; 369 u_int32_t ck, vers; 370 int fic, inoc, obic; 371 int i; 372 char *cp; 373 374 odaddr = daddr; 375 obic = *bic; 376 bip = *bipp; 377 378 /* 379 * Retrieve the segment header, set up the SEGSUM pointer 380 * as well as the first FINFO and inode address pointer. 381 */ 382 cp = fd_ptrget(fs->clfs_devvp, daddr); 383 ssp = (SEGSUM *)cp; 384 iaddrp = ((int32_t *)(cp + fs->lfs_ibsize)) - 1; 385 fip = (FINFO *)(cp + sizeof(SEGSUM)); 386 387 /* 388 * Check segment header magic and checksum 389 */ 390 if (ssp->ss_magic != SS_MAGIC) { 391 syslog(LOG_WARNING, "%s: sumsum magic number bad at 0x%x:" 392 " read 0x%x, expected 0x%x", fs->lfs_fsmnt, 393 (int32_t)daddr, ssp->ss_magic, SS_MAGIC); 394 return 0x0; 395 } 396 ck = cksum(&ssp->ss_datasum, fs->lfs_sumsize - sizeof(ssp->ss_sumsum)); 397 if (ck != ssp->ss_sumsum) { 398 syslog(LOG_WARNING, "%s: sumsum checksum mismatch at 0x%x:" 399 " read 0x%x, computed 0x%x", fs->lfs_fsmnt, 400 (int32_t)daddr, ssp->ss_sumsum, ck); 401 return 0x0; 402 } 403 404 /* Initialize data sum */ 405 ck = 0; 406 407 /* Point daddr at next block after segment summary */ 408 ++daddr; 409 410 /* 411 * Loop over file info and inode pointers. We always move daddr 412 * forward here because we are also computing the data checksum 413 * as we go. 414 */ 415 fic = inoc = 0; 416 while (fic < ssp->ss_nfinfo || inoc < ssp->ss_ninos) { 417 /* 418 * We must have either a file block or an inode block. 419 * If we don't have either one, it's an error. 420 */ 421 if (fic >= ssp->ss_nfinfo && *iaddrp != daddr) { 422 syslog(LOG_WARNING, "%s: bad pseg at %x (seg %d)", 423 fs->lfs_fsmnt, odaddr, dtosn(fs, odaddr)); 424 *bipp = bip; 425 return 0x0; 426 } 427 428 /* 429 * Note each inode from the inode blocks 430 */ 431 if (inoc < ssp->ss_ninos && *iaddrp == daddr) { 432 cp = fd_ptrget(fs->clfs_devvp, daddr); 433 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 434 dip = (struct ufs1_dinode *)cp; 435 for (i = 0; i < fs->lfs_inopb; i++) { 436 if (dip[i].di_inumber == 0) 437 break; 438 439 /* 440 * Check currency before adding it 441 */ 442 #ifndef REPAIR_ZERO_FINFO 443 lfs_ientry(&ifp, fs, dip[i].di_inumber, &ifbp); 444 idaddr = ifp->if_daddr; 445 brelse(ifbp, 0); 446 if (idaddr != daddr) 447 #endif 448 continue; 449 450 /* 451 * A current inode. Add it. 452 */ 453 ++*bic; 454 nbip = (BLOCK_INFO *)realloc(bip, *bic * 455 sizeof(*bip)); 456 if (nbip) 457 bip = nbip; 458 else { 459 --*bic; 460 *bipp = bip; 461 return 0x0; 462 } 463 bip[*bic - 1].bi_inode = dip[i].di_inumber; 464 bip[*bic - 1].bi_lbn = LFS_UNUSED_LBN; 465 bip[*bic - 1].bi_daddr = daddr; 466 bip[*bic - 1].bi_segcreate = ssp->ss_create; 467 bip[*bic - 1].bi_version = dip[i].di_gen; 468 bip[*bic - 1].bi_bp = &(dip[i]); 469 bip[*bic - 1].bi_size = DINODE1_SIZE; 470 } 471 inoc += i; 472 daddr += btofsb(fs, fs->lfs_ibsize); 473 --iaddrp; 474 continue; 475 } 476 477 /* 478 * Note each file block from the finfo blocks 479 */ 480 if (fic >= ssp->ss_nfinfo) 481 continue; 482 483 /* Count this finfo, whether or not we use it */ 484 ++fic; 485 486 /* 487 * If this finfo has nblocks==0, it was written wrong. 488 * Kernels with this problem always wrote this zero-sized 489 * finfo last, so just ignore it. 490 */ 491 if (fip->fi_nblocks == 0) { 492 #ifdef REPAIR_ZERO_FINFO 493 struct ubuf *nbp; 494 SEGSUM *nssp; 495 496 syslog(LOG_WARNING, "fixing short FINFO at %x (seg %d)", 497 odaddr, dtosn(fs, odaddr)); 498 bread(fs->clfs_devvp, odaddr, fs->lfs_fsize, 499 NOCRED, 0, &nbp); 500 nssp = (SEGSUM *)nbp->b_data; 501 --nssp->ss_nfinfo; 502 nssp->ss_sumsum = cksum(&nssp->ss_datasum, 503 fs->lfs_sumsize - sizeof(nssp->ss_sumsum)); 504 bwrite(nbp); 505 #endif 506 syslog(LOG_WARNING, "zero-length FINFO at %x (seg %d)", 507 odaddr, dtosn(fs, odaddr)); 508 continue; 509 } 510 511 /* 512 * Check currency before adding blocks 513 */ 514 #ifdef REPAIR_ZERO_FINFO 515 vers = -1; 516 #else 517 lfs_ientry(&ifp, fs, fip->fi_ino, &ifbp); 518 vers = ifp->if_version; 519 brelse(ifbp, 0); 520 #endif 521 if (vers != fip->fi_version) { 522 size_t size; 523 524 /* Read all the blocks from the data summary */ 525 for (i = 0; i < fip->fi_nblocks; i++) { 526 size = (i == fip->fi_nblocks - 1) ? 527 fip->fi_lastlength : fs->lfs_bsize; 528 cp = fd_ptrget(fs->clfs_devvp, daddr); 529 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 530 daddr += btofsb(fs, size); 531 } 532 fip = (FINFO *)(fip->fi_blocks + fip->fi_nblocks); 533 continue; 534 } 535 536 /* Add all the blocks from the finfos (current or not) */ 537 nbip = (BLOCK_INFO *)realloc(bip, (*bic + fip->fi_nblocks) * 538 sizeof(*bip)); 539 if (nbip) 540 bip = nbip; 541 else { 542 *bipp = bip; 543 return 0x0; 544 } 545 546 for (i = 0; i < fip->fi_nblocks; i++) { 547 bip[*bic + i].bi_inode = fip->fi_ino; 548 bip[*bic + i].bi_lbn = fip->fi_blocks[i]; 549 bip[*bic + i].bi_daddr = daddr; 550 bip[*bic + i].bi_segcreate = ssp->ss_create; 551 bip[*bic + i].bi_version = fip->fi_version; 552 bip[*bic + i].bi_size = (i == fip->fi_nblocks - 1) ? 553 fip->fi_lastlength : fs->lfs_bsize; 554 cp = fd_ptrget(fs->clfs_devvp, daddr); 555 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 556 bip[*bic + i].bi_bp = cp; 557 daddr += btofsb(fs, bip[*bic + i].bi_size); 558 559 #ifdef TEST_PATTERN 560 check_test_pattern(bip + *bic + i); /* XXXDEBUG */ 561 #endif 562 } 563 *bic += fip->fi_nblocks; 564 fip = (FINFO *)(fip->fi_blocks + fip->fi_nblocks); 565 } 566 567 #ifndef REPAIR_ZERO_FINFO 568 if (ssp->ss_datasum != ck) { 569 syslog(LOG_WARNING, "%s: data checksum bad at 0x%x:" 570 " read 0x%x, computed 0x%x", fs->lfs_fsmnt, odaddr, 571 ssp->ss_datasum, ck); 572 *bic = obic; 573 return 0x0; 574 } 575 #endif 576 577 *bipp = bip; 578 return daddr; 579 } 580 581 static void 582 log_segment_read(struct clfs *fs, int sn) 583 { 584 FILE *fp; 585 char *cp; 586 587 /* 588 * Write the segment read, and its contents, into a log file in 589 * the current directory. We don't need to log the location of 590 * the segment, since that can be inferred from the segments up 591 * to this point (ss_nextseg field of the previously written segment). 592 * 593 * We can use this info later to reconstruct the filesystem at any 594 * given point in time for analysis, by replaying the log forward 595 * indexed by the segment serial numbers; but it is not suitable 596 * for everyday use since the copylog will be simply enormous. 597 */ 598 cp = fd_ptrget(fs->clfs_devvp, sntod(fs, sn)); 599 600 fp = fopen(copylog_filename, "ab"); 601 if (fp != NULL) { 602 if (fwrite(cp, (size_t)fs->lfs_ssize, 1, fp) != 1) { 603 perror("writing segment to copy log"); 604 } 605 } 606 fclose(fp); 607 } 608 609 /* 610 * Read a segment to populate the BLOCK_INFO structures. 611 * Return the number of partial segments read and parsed. 612 */ 613 int 614 load_segment(struct clfs *fs, int sn, BLOCK_INFO **bipp, int *bic) 615 { 616 int32_t daddr; 617 int i, npseg; 618 619 daddr = sntod(fs, sn); 620 if (daddr < btofsb(fs, LFS_LABELPAD)) 621 daddr = btofsb(fs, LFS_LABELPAD); 622 for (i = 0; i < LFS_MAXNUMSB; i++) { 623 if (fs->lfs_sboffs[i] == daddr) { 624 daddr += btofsb(fs, LFS_SBPAD); 625 break; 626 } 627 } 628 629 /* Preload the segment buffer */ 630 if (fd_preload(fs->clfs_devvp, sntod(fs, sn)) < 0) 631 return -1; 632 633 if (copylog_filename) 634 log_segment_read(fs, sn); 635 636 /* Note bytes read for stats */ 637 cleaner_stats.segs_cleaned++; 638 cleaner_stats.bytes_read += fs->lfs_ssize; 639 ++fs->clfs_nactive; 640 641 npseg = 0; 642 while(dtosn(fs, daddr) == sn && 643 dtosn(fs, daddr + btofsb(fs, fs->lfs_bsize)) == sn) { 644 daddr = parse_pseg(fs, daddr, bipp, bic); 645 if (daddr == 0x0) { 646 ++cleaner_stats.segs_error; 647 break; 648 } 649 ++npseg; 650 } 651 652 return npseg; 653 } 654 655 void 656 calc_cb(struct clfs *fs, int sn, struct clfs_seguse *t) 657 { 658 time_t now; 659 int64_t age, benefit, cost; 660 661 time(&now); 662 age = (now < t->lastmod ? 0 : now - t->lastmod); 663 664 /* Under no circumstances clean active or already-clean segments */ 665 if ((t->flags & SEGUSE_ACTIVE) || !(t->flags & SEGUSE_DIRTY)) { 666 t->priority = 0; 667 return; 668 } 669 670 /* 671 * If the segment is empty, there is no reason to clean it. 672 * Clear its error condition, if any, since we are never going to 673 * try to parse this one. 674 */ 675 if (t->nbytes == 0) { 676 t->flags &= ~SEGUSE_ERROR; /* Strip error once empty */ 677 t->priority = 0; 678 return; 679 } 680 681 if (t->flags & SEGUSE_ERROR) { /* No good if not already empty */ 682 /* No benefit */ 683 t->priority = 0; 684 return; 685 } 686 687 if (t->nbytes > fs->lfs_ssize) { 688 /* Another type of error */ 689 syslog(LOG_WARNING, "segment %d: bad seguse count %d", 690 sn, t->nbytes); 691 t->flags |= SEGUSE_ERROR; 692 t->priority = 0; 693 return; 694 } 695 696 /* 697 * The non-degenerate case. Use Rosenblum's cost-benefit algorithm. 698 * Calculate the benefit from cleaning this segment (one segment, 699 * minus fragmentation, dirty blocks and a segment summary block) 700 * and weigh that against the cost (bytes read plus bytes written). 701 * We count the summary headers as "dirty" to avoid cleaning very 702 * old and very full segments. 703 */ 704 benefit = (int64_t)fs->lfs_ssize - t->nbytes - 705 (t->nsums + 1) * fs->lfs_fsize; 706 if (fs->lfs_bsize > fs->lfs_fsize) /* fragmentation */ 707 benefit -= (fs->lfs_bsize / 2); 708 if (benefit <= 0) { 709 t->priority = 0; 710 return; 711 } 712 713 cost = fs->lfs_ssize + t->nbytes; 714 t->priority = (256 * benefit * age) / cost; 715 716 return; 717 } 718 719 /* 720 * Comparator for BLOCK_INFO structures. Anything not in one of the segments 721 * we're looking at sorts higher; after that we sort first by inode number 722 * and then by block number (unsigned, i.e., negative sorts higher) *but* 723 * sort inodes before data blocks. 724 */ 725 static int 726 bi_comparator(const void *va, const void *vb) 727 { 728 const BLOCK_INFO *a, *b; 729 730 a = (const BLOCK_INFO *)va; 731 b = (const BLOCK_INFO *)vb; 732 733 /* Check for out-of-place block */ 734 if (a->bi_segcreate == a->bi_daddr && 735 b->bi_segcreate != b->bi_daddr) 736 return -1; 737 if (a->bi_segcreate != a->bi_daddr && 738 b->bi_segcreate == b->bi_daddr) 739 return 1; 740 if (a->bi_size <= 0 && b->bi_size > 0) 741 return 1; 742 if (b->bi_size <= 0 && a->bi_size > 0) 743 return -1; 744 745 /* Check inode number */ 746 if (a->bi_inode != b->bi_inode) 747 return a->bi_inode - b->bi_inode; 748 749 /* Check lbn */ 750 if (a->bi_lbn == LFS_UNUSED_LBN) /* Inodes sort lower than blocks */ 751 return -1; 752 if (b->bi_lbn == LFS_UNUSED_LBN) 753 return 1; 754 if ((u_int32_t)a->bi_lbn > (u_int32_t)b->bi_lbn) 755 return 1; 756 else 757 return -1; 758 759 return 0; 760 } 761 762 /* 763 * Comparator for sort_segments: cost-benefit equation. 764 */ 765 static int 766 cb_comparator(const void *va, const void *vb) 767 { 768 const struct clfs_seguse *a, *b; 769 770 a = *(const struct clfs_seguse * const *)va; 771 b = *(const struct clfs_seguse * const *)vb; 772 return a->priority > b->priority ? -1 : 1; 773 } 774 775 void 776 toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, int *bic, int *sizep) 777 { 778 int i, r; 779 BLOCK_INFO *bip = *bipp; 780 struct lfs_fcntl_markv /* { 781 BLOCK_INFO *blkiov; 782 int blkcnt; 783 } */ lim; 784 785 if (bic == 0 || bip == NULL) 786 return; 787 788 /* 789 * Kludge: Store the disk address in segcreate so we know which 790 * ones to toss. 791 */ 792 for (i = 0; i < *bic; i++) 793 bip[i].bi_segcreate = bip[i].bi_daddr; 794 795 /* Sort the blocks */ 796 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 797 798 /* Use bmapv to locate the blocks */ 799 lim.blkiov = bip; 800 lim.blkcnt = *bic; 801 if ((r = fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim)) < 0) { 802 syslog(LOG_WARNING, "%s: bmapv returned %d (%m)", 803 fs->lfs_fsmnt, r); 804 return; 805 } 806 807 /* Toss blocks not in this segment */ 808 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 809 810 /* Get rid of stale blocks */ 811 if (sizep) 812 *sizep = 0; 813 for (i = 0; i < *bic; i++) { 814 if (bip[i].bi_segcreate != bip[i].bi_daddr) 815 break; 816 if (sizep) 817 *sizep += bip[i].bi_size; 818 } 819 *bic = i; /* XXX realloc bip? */ 820 *bipp = bip; 821 822 return; 823 } 824 825 /* 826 * Clean a segment and mark it invalid. 827 */ 828 int 829 invalidate_segment(struct clfs *fs, int sn) 830 { 831 BLOCK_INFO *bip; 832 int i, r, bic; 833 off_t nb; 834 double util; 835 struct lfs_fcntl_markv /* { 836 BLOCK_INFO *blkiov; 837 int blkcnt; 838 } */ lim; 839 840 dlog("%s: inval seg %d", fs->lfs_fsmnt, sn); 841 842 bip = NULL; 843 bic = 0; 844 fs->clfs_nactive = 0; 845 if (load_segment(fs, sn, &bip, &bic) <= 0) 846 return -1; 847 toss_old_blocks(fs, &bip, &bic, NULL); 848 849 /* Record statistics */ 850 for (i = nb = 0; i < bic; i++) 851 nb += bip[i].bi_size; 852 util = ((double)nb) / (fs->clfs_nactive * fs->lfs_ssize); 853 cleaner_stats.util_tot += util; 854 cleaner_stats.util_sos += util * util; 855 cleaner_stats.bytes_written += nb; 856 857 /* 858 * Use markv to move the blocks. 859 */ 860 lim.blkiov = bip; 861 lim.blkcnt = bic; 862 if ((r = fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) { 863 syslog(LOG_WARNING, "%s: markv returned %d (%m) " 864 "for seg %d", fs->lfs_fsmnt, r, sn); 865 return r; 866 } 867 868 /* 869 * Finally call invalidate to invalidate the segment. 870 */ 871 if ((r = fcntl(fs->clfs_ifilefd, LFCNINVAL, &sn)) < 0) { 872 syslog(LOG_WARNING, "%s: inval returned %d (%m) " 873 "for seg %d", fs->lfs_fsmnt, r, sn); 874 return r; 875 } 876 877 return 0; 878 } 879 880 /* 881 * Check to see if the given ino/lbn pair is represented in the BLOCK_INFO 882 * array we are sending to the kernel, or if the kernel will have to add it. 883 * The kernel will only add each such pair once, though, so keep track of 884 * previous requests in a separate "extra" BLOCK_INFO array. Returns 1 885 * if the block needs to be added, 0 if it is already represented. 886 */ 887 static int 888 check_or_add(ino_t ino, int32_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp) 889 { 890 BLOCK_INFO *t, *ebip = *ebipp; 891 int ebic = *ebicp; 892 int k; 893 894 for (k = 0; k < bic; k++) { 895 if (bip[k].bi_inode != ino) 896 break; 897 if (bip[k].bi_lbn == lbn) { 898 return 0; 899 } 900 } 901 902 /* Look on the list of extra blocks, too */ 903 for (k = 0; k < ebic; k++) { 904 if (ebip[k].bi_inode == ino && ebip[k].bi_lbn == lbn) { 905 return 0; 906 } 907 } 908 909 ++ebic; 910 t = realloc(ebip, ebic * sizeof(BLOCK_INFO)); 911 if (t == NULL) 912 return 1; /* Note *ebipc is not updated */ 913 914 ebip = t; 915 ebip[ebic - 1].bi_inode = ino; 916 ebip[ebic - 1].bi_lbn = lbn; 917 918 *ebipp = ebip; 919 *ebicp = ebic; 920 return 1; 921 } 922 923 /* 924 * Look for indirect blocks we will have to write which are not 925 * contained in this collection of blocks. This constitutes 926 * a hidden cleaning cost, since we are unaware of it until we 927 * have already read the segments. Return the total cost, and fill 928 * in *ifc with the part of that cost due to rewriting the Ifile. 929 */ 930 static off_t 931 check_hidden_cost(struct clfs *fs, BLOCK_INFO *bip, int bic, off_t *ifc) 932 { 933 int start; 934 struct indir in[NIADDR + 1]; 935 int num; 936 int i, j, ebic; 937 BLOCK_INFO *ebip; 938 int32_t lbn; 939 940 start = 0; 941 ebip = NULL; 942 ebic = 0; 943 for (i = 0; i < bic; i++) { 944 if (i == 0 || bip[i].bi_inode != bip[start].bi_inode) { 945 start = i; 946 /* 947 * Look for IFILE blocks, unless this is the Ifile. 948 */ 949 if (bip[i].bi_inode != fs->lfs_ifile) { 950 lbn = fs->lfs_cleansz + bip[i].bi_inode / 951 fs->lfs_ifpb; 952 *ifc += check_or_add(fs->lfs_ifile, lbn, 953 bip, bic, &ebip, &ebic); 954 } 955 } 956 if (bip[i].bi_lbn == LFS_UNUSED_LBN) 957 continue; 958 if (bip[i].bi_lbn < NDADDR) 959 continue; 960 961 ufs_getlbns((struct lfs *)fs, NULL, (daddr_t)bip[i].bi_lbn, in, &num); 962 for (j = 0; j < num; j++) { 963 check_or_add(bip[i].bi_inode, in[j].in_lbn, 964 bip + start, bic - start, &ebip, &ebic); 965 } 966 } 967 return ebic; 968 } 969 970 /* 971 * Select segments to clean, add blocks from these segments to a cleaning 972 * list, and send this list through lfs_markv() to move them to new 973 * locations on disk. 974 */ 975 int 976 clean_fs(struct clfs *fs, CLEANERINFO *cip) 977 { 978 int i, j, ngood, sn, bic, r, npos; 979 int bytes, totbytes; 980 struct ubuf *bp; 981 SEGUSE *sup; 982 static BLOCK_INFO *bip; 983 struct lfs_fcntl_markv /* { 984 BLOCK_INFO *blkiov; 985 int blkcnt; 986 } */ lim; 987 int mc; 988 BLOCK_INFO *mbip; 989 int inc; 990 off_t nb; 991 off_t goal; 992 off_t extra, if_extra; 993 double util; 994 995 /* Read the segment table into our private structure */ 996 npos = 0; 997 for (i = 0; i < fs->lfs_nseg; i+= fs->lfs_sepb) { 998 bread(fs->lfs_ivnode, fs->lfs_cleansz + i / fs->lfs_sepb, 999 fs->lfs_bsize, NOCRED, 0, &bp); 1000 for (j = 0; j < fs->lfs_sepb && i + j < fs->lfs_nseg; j++) { 1001 sup = ((SEGUSE *)bp->b_data) + j; 1002 fs->clfs_segtab[i + j].nbytes = sup->su_nbytes; 1003 fs->clfs_segtab[i + j].nsums = sup->su_nsums; 1004 fs->clfs_segtab[i + j].lastmod = sup->su_lastmod; 1005 /* Keep error status but renew other flags */ 1006 fs->clfs_segtab[i + j].flags &= SEGUSE_ERROR; 1007 fs->clfs_segtab[i + j].flags |= sup->su_flags; 1008 1009 /* Compute cost-benefit coefficient */ 1010 calc_cb(fs, i + j, fs->clfs_segtab + i + j); 1011 if (fs->clfs_segtab[i + j].priority > 0) 1012 ++npos; 1013 } 1014 brelse(bp, 0); 1015 } 1016 1017 /* Sort segments based on cleanliness, fulness, and condition */ 1018 heapsort(fs->clfs_segtabp, fs->lfs_nseg, sizeof(struct clfs_seguse *), 1019 cb_comparator); 1020 1021 /* If no segment is cleanable, just return */ 1022 if (fs->clfs_segtabp[0]->priority == 0) { 1023 dlog("%s: no segment cleanable", fs->lfs_fsmnt); 1024 return 0; 1025 } 1026 1027 /* Load some segments' blocks into bip */ 1028 bic = 0; 1029 fs->clfs_nactive = 0; 1030 ngood = 0; 1031 if (use_bytes) { 1032 /* Set attainable goal */ 1033 goal = fs->lfs_ssize * atatime; 1034 if (goal > (cip->clean - 1) * fs->lfs_ssize / 2) 1035 goal = MAX((cip->clean - 1) * fs->lfs_ssize, 1036 fs->lfs_ssize) / 2; 1037 1038 dlog("%s: cleaning with goal %" PRId64 1039 " bytes (%d segs clean, %d cleanable)", 1040 fs->lfs_fsmnt, goal, cip->clean, npos); 1041 syslog(LOG_INFO, "%s: cleaning with goal %" PRId64 1042 " bytes (%d segs clean, %d cleanable)", 1043 fs->lfs_fsmnt, goal, cip->clean, npos); 1044 totbytes = 0; 1045 for (i = 0; i < fs->lfs_nseg && totbytes < goal; i++) { 1046 if (fs->clfs_segtabp[i]->priority == 0) 1047 break; 1048 /* Upper bound on number of segments at once */ 1049 if (ngood * fs->lfs_ssize > 4 * goal) 1050 break; 1051 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1052 dlog("%s: add seg %d prio %" PRIu64 1053 " containing %ld bytes", 1054 fs->lfs_fsmnt, sn, fs->clfs_segtabp[i]->priority, 1055 fs->clfs_segtabp[i]->nbytes); 1056 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) { 1057 ++ngood; 1058 toss_old_blocks(fs, &bip, &bic, &bytes); 1059 totbytes += bytes; 1060 } else if (r == 0) 1061 fd_release(fs->clfs_devvp); 1062 else 1063 break; 1064 } 1065 } else { 1066 /* Set attainable goal */ 1067 goal = atatime; 1068 if (goal > cip->clean - 1) 1069 goal = MAX(cip->clean - 1, 1); 1070 1071 dlog("%s: cleaning with goal %d segments (%d clean, %d cleanable)", 1072 fs->lfs_fsmnt, (int)goal, cip->clean, npos); 1073 for (i = 0; i < fs->lfs_nseg && ngood < goal; i++) { 1074 if (fs->clfs_segtabp[i]->priority == 0) 1075 break; 1076 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1077 dlog("%s: add seg %d prio %" PRIu64, 1078 fs->lfs_fsmnt, sn, fs->clfs_segtabp[i]->priority); 1079 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) 1080 ++ngood; 1081 else if (r == 0) 1082 fd_release(fs->clfs_devvp); 1083 else 1084 break; 1085 } 1086 toss_old_blocks(fs, &bip, &bic, NULL); 1087 } 1088 1089 /* If there is nothing to do, try again later. */ 1090 if (bic == 0) { 1091 dlog("%s: no blocks to clean in %d cleanable segments", 1092 fs->lfs_fsmnt, (int)ngood); 1093 fd_release_all(fs->clfs_devvp); 1094 return 0; 1095 } 1096 1097 /* Record statistics */ 1098 for (i = nb = 0; i < bic; i++) 1099 nb += bip[i].bi_size; 1100 util = ((double)nb) / (fs->clfs_nactive * fs->lfs_ssize); 1101 cleaner_stats.util_tot += util; 1102 cleaner_stats.util_sos += util * util; 1103 cleaner_stats.bytes_written += nb; 1104 1105 /* 1106 * Check out our blocks to see if there are hidden cleaning costs. 1107 * If there are, we might be cleaning ourselves deeper into a hole 1108 * rather than doing anything useful. 1109 * XXX do something about this. 1110 */ 1111 if_extra = 0; 1112 extra = fs->lfs_bsize * (off_t)check_hidden_cost(fs, bip, bic, &if_extra); 1113 if_extra *= fs->lfs_bsize; 1114 1115 /* 1116 * Use markv to move the blocks. 1117 */ 1118 if (do_small) 1119 inc = MAXPHYS / fs->lfs_bsize - 1; 1120 else 1121 inc = LFS_MARKV_MAXBLKCNT / 2; 1122 for (mc = 0, mbip = bip; mc < bic; mc += inc, mbip += inc) { 1123 lim.blkiov = mbip; 1124 lim.blkcnt = (bic - mc > inc ? inc : bic - mc); 1125 #ifdef TEST_PATTERN 1126 dlog("checking blocks %d-%d", mc, mc + lim.blkcnt - 1); 1127 for (i = 0; i < lim.blkcnt; i++) { 1128 check_test_pattern(mbip + i); 1129 } 1130 #endif /* TEST_PATTERN */ 1131 dlog("sending blocks %d-%d", mc, mc + lim.blkcnt - 1); 1132 if ((r = fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) { 1133 syslog(LOG_WARNING, "%s: markv returned %d (%m)", 1134 fs->lfs_fsmnt, r); 1135 if (errno != EAGAIN && errno != ESHUTDOWN) { 1136 fd_release_all(fs->clfs_devvp); 1137 return r; 1138 } 1139 } 1140 } 1141 1142 /* 1143 * Report progress (or lack thereof) 1144 */ 1145 syslog(LOG_INFO, "%s: wrote %" PRId64 " dirty + %" 1146 PRId64 " supporting indirect + %" 1147 PRId64 " supporting Ifile = %" 1148 PRId64 " bytes to clean %d segs (%" PRId64 "%% recovery)", 1149 fs->lfs_fsmnt, (int64_t)nb, (int64_t)(extra - if_extra), 1150 (int64_t)if_extra, (int64_t)(nb + extra), ngood, 1151 (ngood ? (int64_t)(100 - (100 * (nb + extra)) / 1152 (ngood * fs->lfs_ssize)) : 1153 (int64_t)0)); 1154 if (nb + extra >= ngood * fs->lfs_ssize) 1155 syslog(LOG_WARNING, "%s: cleaner not making forward progress", 1156 fs->lfs_fsmnt); 1157 1158 /* 1159 * Finally call reclaim to prompt cleaning of the segments. 1160 */ 1161 fcntl(fs->clfs_ifilefd, LFCNRECLAIM, NULL); 1162 1163 fd_release_all(fs->clfs_devvp); 1164 return 0; 1165 } 1166 1167 /* 1168 * Read the cleanerinfo block and apply cleaning policy to determine whether 1169 * the given filesystem needs to be cleaned. Returns 1 if it does, 0 if it 1170 * does not, or -1 on error. 1171 */ 1172 int 1173 needs_cleaning(struct clfs *fs, CLEANERINFO *cip) 1174 { 1175 struct ubuf *bp; 1176 struct stat st; 1177 daddr_t fsb_per_seg, max_free_segs; 1178 time_t now; 1179 double loadavg; 1180 1181 /* If this fs is "on hold", don't clean it. */ 1182 if (fs->clfs_onhold) 1183 return 0; 1184 1185 /* 1186 * Read the cleanerinfo block from the Ifile. We don't want 1187 * the cached information, so invalidate the buffer before 1188 * handing it back. 1189 */ 1190 if (bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp)) { 1191 syslog(LOG_ERR, "%s: can't read inode", fs->lfs_fsmnt); 1192 return -1; 1193 } 1194 *cip = *(CLEANERINFO *)bp->b_data; /* Structure copy */ 1195 brelse(bp, B_INVAL); 1196 cleaner_stats.bytes_read += fs->lfs_bsize; 1197 1198 /* 1199 * If the number of segments changed under us, reinit. 1200 * We don't have to start over from scratch, however, 1201 * since we don't hold any buffers. 1202 */ 1203 if (fs->lfs_nseg != cip->clean + cip->dirty) { 1204 if (reinit_fs(fs) < 0) { 1205 /* The normal case for unmount */ 1206 syslog(LOG_NOTICE, "%s: filesystem unmounted", fs->lfs_fsmnt); 1207 return -1; 1208 } 1209 syslog(LOG_NOTICE, "%s: nsegs changed", fs->lfs_fsmnt); 1210 } 1211 1212 /* Compute theoretical "free segments" maximum based on usage */ 1213 fsb_per_seg = segtod(fs, 1); 1214 max_free_segs = MAX(cip->bfree, 0) / fsb_per_seg + fs->lfs_minfreeseg; 1215 1216 dlog("%s: bfree = %d, avail = %d, clean = %d/%d", 1217 fs->lfs_fsmnt, cip->bfree, cip->avail, cip->clean, fs->lfs_nseg); 1218 1219 /* If the writer is waiting on us, clean it */ 1220 if (cip->clean <= fs->lfs_minfreeseg || 1221 (cip->flags & LFS_CLEANER_MUST_CLEAN)) 1222 return 1; 1223 1224 /* If there are enough segments, don't clean it */ 1225 if (cip->bfree - cip->avail <= fsb_per_seg && 1226 cip->avail > fsb_per_seg) 1227 return 0; 1228 1229 /* If we are in dire straits, clean it */ 1230 if (cip->bfree - cip->avail > fsb_per_seg && 1231 cip->avail <= fsb_per_seg) 1232 return 1; 1233 1234 /* If under busy threshold, clean regardless of load */ 1235 if (cip->clean < max_free_segs * BUSY_LIM) 1236 return 1; 1237 1238 /* Check busy status; clean if idle and under idle limit */ 1239 if (use_fs_idle) { 1240 /* Filesystem idle */ 1241 time(&now); 1242 if (fstat(fs->clfs_ifilefd, &st) < 0) { 1243 syslog(LOG_ERR, "%s: failed to stat ifile", 1244 fs->lfs_fsmnt); 1245 return -1; 1246 } 1247 if (now - st.st_mtime > segwait_timeout && 1248 cip->clean < max_free_segs * IDLE_LIM) 1249 return 1; 1250 } else { 1251 /* CPU idle - use one-minute load avg */ 1252 if (getloadavg(&loadavg, 1) == -1) { 1253 syslog(LOG_ERR, "%s: failed to get load avg", 1254 fs->lfs_fsmnt); 1255 return -1; 1256 } 1257 if (loadavg < load_threshold && 1258 cip->clean < max_free_segs * IDLE_LIM) 1259 return 1; 1260 } 1261 1262 return 0; 1263 } 1264 1265 /* 1266 * Report statistics. If the signal was SIGUSR2, clear the statistics too. 1267 * If the signal was SIGINT, exit. 1268 */ 1269 static void 1270 sig_report(int sig) 1271 { 1272 double avg = 0.0, stddev; 1273 1274 avg = cleaner_stats.util_tot / MAX(cleaner_stats.segs_cleaned, 1.0); 1275 stddev = cleaner_stats.util_sos / MAX(cleaner_stats.segs_cleaned - 1276 avg * avg, 1.0); 1277 syslog(LOG_INFO, "bytes read: %" PRId64, cleaner_stats.bytes_read); 1278 syslog(LOG_INFO, "bytes written: %" PRId64, cleaner_stats.bytes_written); 1279 syslog(LOG_INFO, "segments cleaned: %" PRId64, cleaner_stats.segs_cleaned); 1280 #if 0 1281 /* "Empty segments" is meaningless, since the kernel handles those */ 1282 syslog(LOG_INFO, "empty segments: %" PRId64, cleaner_stats.segs_empty); 1283 #endif 1284 syslog(LOG_INFO, "error segments: %" PRId64, cleaner_stats.segs_error); 1285 syslog(LOG_INFO, "utilization total: %g", cleaner_stats.util_tot); 1286 syslog(LOG_INFO, "utilization sos: %g", cleaner_stats.util_sos); 1287 syslog(LOG_INFO, "utilization avg: %4.2f", avg); 1288 syslog(LOG_INFO, "utilization sdev: %9.6f", stddev); 1289 1290 if (debug) 1291 bufstats(); 1292 1293 if (sig == SIGUSR2) 1294 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1295 if (sig == SIGINT) 1296 exit(0); 1297 } 1298 1299 static void 1300 sig_exit(int sig) 1301 { 1302 exit(0); 1303 } 1304 1305 static void 1306 usage(void) 1307 { 1308 errx(1, "usage: lfs_cleanerd [-bcdfmqs] [-i segnum] [-l load] " 1309 "[-n nsegs] [-r report_freq] [-t timeout] fs_name ..."); 1310 } 1311 1312 /* 1313 * Main. 1314 */ 1315 int 1316 main(int argc, char **argv) 1317 { 1318 int i, opt, error, r, loopcount; 1319 struct timeval tv; 1320 CLEANERINFO ci; 1321 #ifndef USE_CLIENT_SERVER 1322 char *cp, *pidname; 1323 #endif 1324 1325 /* 1326 * Set up defaults 1327 */ 1328 atatime = 1; 1329 segwait_timeout = 300; /* Five minutes */ 1330 load_threshold = 0.2; 1331 stat_report = 0; 1332 inval_segment = -1; 1333 copylog_filename = NULL; 1334 1335 /* 1336 * Parse command-line arguments 1337 */ 1338 while ((opt = getopt(argc, argv, "bC:cdfi:l:mn:qr:st:")) != -1) { 1339 switch (opt) { 1340 case 'b': /* Use bytes written, not segments read */ 1341 use_bytes = 1; 1342 break; 1343 case 'C': /* copy log */ 1344 copylog_filename = optarg; 1345 break; 1346 case 'c': /* Coalesce files */ 1347 do_coalesce++; 1348 break; 1349 case 'd': /* Debug mode. */ 1350 debug++; 1351 break; 1352 case 'f': /* Use fs idle time rather than cpu idle */ 1353 use_fs_idle = 1; 1354 break; 1355 case 'i': /* Invalidate this segment */ 1356 inval_segment = atoi(optarg); 1357 break; 1358 case 'l': /* Load below which to clean */ 1359 load_threshold = atof(optarg); 1360 break; 1361 case 'm': /* [compat only] */ 1362 break; 1363 case 'n': /* How many segs to clean at once */ 1364 atatime = atoi(optarg); 1365 break; 1366 case 'q': /* Quit after one run */ 1367 do_quit = 1; 1368 break; 1369 case 'r': /* Report every stat_report segments */ 1370 stat_report = atoi(optarg); 1371 break; 1372 case 's': /* Small writes */ 1373 do_small = 1; 1374 break; 1375 case 't': /* timeout */ 1376 segwait_timeout = atoi(optarg); 1377 break; 1378 default: 1379 usage(); 1380 /* NOTREACHED */ 1381 } 1382 } 1383 argc -= optind; 1384 argv += optind; 1385 1386 if (argc < 1) 1387 usage(); 1388 if (inval_segment >= 0 && argc != 1) { 1389 errx(1, "lfs_cleanerd: may only specify one filesystem when " 1390 "using -i flag"); 1391 } 1392 1393 if (do_coalesce) { 1394 errx(1, "lfs_cleanerd: -c disabled due to reports of file " 1395 "corruption; you may re-enable it by rebuilding the " 1396 "cleaner"); 1397 } 1398 1399 /* 1400 * Set up daemon mode or verbose debug mode 1401 */ 1402 if (debug) { 1403 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID | LOG_PERROR, 1404 LOG_DAEMON); 1405 signal(SIGINT, sig_report); 1406 } else { 1407 if (daemon(0, 0) == -1) 1408 err(1, "lfs_cleanerd: couldn't become a daemon!"); 1409 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID, LOG_DAEMON); 1410 signal(SIGINT, sig_exit); 1411 } 1412 1413 /* 1414 * Look for an already-running master daemon. If there is one, 1415 * send it our filesystems to add to its list and exit. 1416 * If there is none, become the master. 1417 */ 1418 #ifdef USE_CLIENT_SERVER 1419 try_to_become_master(argc, argv); 1420 #else 1421 /* XXX think about this */ 1422 asprintf(&pidname, "lfs_cleanerd:m:%s", argv[0]); 1423 if (pidname == NULL) { 1424 syslog(LOG_ERR, "malloc failed: %m"); 1425 exit(1); 1426 } 1427 for (cp = pidname; cp != NULL; cp = strchr(cp, '/')) 1428 *cp = '|'; 1429 pidfile(pidname); 1430 #endif 1431 1432 /* 1433 * Signals mean daemon should report its statistics 1434 */ 1435 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1436 signal(SIGUSR1, sig_report); 1437 signal(SIGUSR2, sig_report); 1438 1439 /* 1440 * Start up buffer cache. We only use this for the Ifile, 1441 * and we will resize it if necessary, so it can start small. 1442 */ 1443 bufinit(4); 1444 1445 #ifdef REPAIR_ZERO_FINFO 1446 { 1447 BLOCK_INFO *bip = NULL; 1448 int bic = 0; 1449 1450 nfss = 1; 1451 fsp = (struct clfs **)malloc(sizeof(*fsp)); 1452 fsp[0] = (struct clfs *)calloc(1, sizeof(**fsp)); 1453 1454 if (init_unmounted_fs(fsp[0], argv[0]) < 0) { 1455 err(1, "init_unmounted_fs"); 1456 } 1457 dlog("Filesystem has %d segments", fsp[0]->lfs_nseg); 1458 for (i = 0; i < fsp[0]->lfs_nseg; i++) { 1459 load_segment(fsp[0], i, &bip, &bic); 1460 bic = 0; 1461 } 1462 exit(0); 1463 } 1464 #endif 1465 1466 /* 1467 * Initialize cleaning structures, open devices, etc. 1468 */ 1469 nfss = argc; 1470 fsp = (struct clfs **)malloc(nfss * sizeof(*fsp)); 1471 if (fsp == NULL) { 1472 syslog(LOG_ERR, "couldn't allocate fs table: %m"); 1473 exit(1); 1474 } 1475 for (i = 0; i < nfss; i++) { 1476 fsp[i] = (struct clfs *)calloc(1, sizeof(**fsp)); 1477 if ((r = init_fs(fsp[i], argv[i])) < 0) { 1478 syslog(LOG_ERR, "%s: couldn't init: error code %d", 1479 argv[i], r); 1480 handle_error(fsp, i); 1481 --i; /* Do the new #i over again */ 1482 } 1483 } 1484 1485 /* 1486 * If asked to coalesce, do so and exit. 1487 */ 1488 if (do_coalesce) { 1489 for (i = 0; i < nfss; i++) 1490 clean_all_inodes(fsp[i]); 1491 exit(0); 1492 } 1493 1494 /* 1495 * If asked to invalidate a segment, do that and exit. 1496 */ 1497 if (inval_segment >= 0) { 1498 invalidate_segment(fsp[0], inval_segment); 1499 exit(0); 1500 } 1501 1502 /* 1503 * Main cleaning loop. 1504 */ 1505 loopcount = 0; 1506 while (nfss > 0) { 1507 int cleaned_one; 1508 do { 1509 #ifdef USE_CLIENT_SERVER 1510 check_control_socket(); 1511 #endif 1512 cleaned_one = 0; 1513 for (i = 0; i < nfss; i++) { 1514 if ((error = needs_cleaning(fsp[i], &ci)) < 0) { 1515 handle_error(fsp, i); 1516 continue; 1517 } 1518 if (error == 0) /* No need to clean */ 1519 continue; 1520 1521 reload_ifile(fsp[i]); 1522 if (clean_fs(fsp[i], &ci) < 0) { 1523 handle_error(fsp, i); 1524 continue; 1525 } 1526 ++cleaned_one; 1527 } 1528 ++loopcount; 1529 if (stat_report && loopcount % stat_report == 0) 1530 sig_report(0); 1531 if (do_quit) 1532 exit(0); 1533 } while(cleaned_one); 1534 tv.tv_sec = segwait_timeout; 1535 tv.tv_usec = 0; 1536 fcntl(fsp[0]->clfs_ifilefd, LFCNSEGWAITALL, &tv); 1537 } 1538 1539 /* NOTREACHED */ 1540 return 0; 1541 } 1542