1 /* $NetBSD: lfs_cleanerd.c,v 1.7 2006/05/12 19:33:02 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * The cleaner daemon for the NetBSD Log-structured File System. 41 * Only tested for use with version 2 LFSs. 42 */ 43 44 #include <sys/syslog.h> 45 #include <sys/param.h> 46 #include <sys/mount.h> 47 #include <sys/stat.h> 48 #include <ufs/ufs/inode.h> 49 #include <ufs/lfs/lfs.h> 50 51 #include <assert.h> 52 #include <err.h> 53 #include <errno.h> 54 #include <fcntl.h> 55 #include <stdio.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include <unistd.h> 59 #include <time.h> 60 #include <util.h> 61 62 #include "bufcache.h" 63 #include "vnode.h" 64 #include "lfs_user.h" 65 #include "fdfs.h" 66 #include "cleaner.h" 67 68 /* 69 * Global variables. 70 */ 71 /* XXX these top few should really be fs-specific */ 72 int use_fs_idle; /* Use fs idle rather than cpu idle time */ 73 int use_bytes; /* Use bytes written rather than segments cleaned */ 74 int load_threshold; /* How idle is idle (CPU idle) */ 75 int atatime; /* How many segments (bytes) to clean at a time */ 76 77 int nfss; /* Number of filesystems monitored by this cleanerd */ 78 struct clfs **fsp; /* Array of extended filesystem structures */ 79 int segwait_timeout; /* Time to wait in lfs_segwait() */ 80 int do_quit; /* Quit after one cleaning loop */ 81 int do_coalesce; /* Coalesce filesystem */ 82 int do_small; /* Use small writes through markv */ 83 char *copylog_filename; /* File to use for fs debugging analysis */ 84 int inval_segment; /* Segment to invalidate */ 85 int stat_report; /* Report statistics for this period of cycles */ 86 int debug; /* Turn on debugging */ 87 struct cleaner_stats { 88 double util_tot; 89 double util_sos; 90 off_t bytes_read; 91 off_t bytes_written; 92 off_t segs_cleaned; 93 off_t segs_empty; 94 off_t segs_error; 95 } cleaner_stats; 96 97 extern u_int32_t cksum(void *, size_t); 98 extern u_int32_t lfs_sb_cksum(struct dlfs *); 99 extern u_int32_t lfs_cksum_part(void *, size_t, u_int32_t); 100 extern int ufs_getlbns(struct lfs *, struct uvnode *, daddr_t, struct indir *, int *); 101 102 /* Compat */ 103 void pwarn(const char *unused, ...) { /* Does nothing */ }; 104 105 /* 106 * Log a message if debugging is turned on. 107 */ 108 void 109 dlog(char *fmt, ...) 110 { 111 va_list ap; 112 113 if (debug == 0) 114 return; 115 116 va_start(ap, fmt); 117 vsyslog(LOG_DEBUG, fmt, ap); 118 va_end(ap); 119 } 120 121 /* 122 * Remove the specified filesystem from the list, due to its having 123 * become unmounted or other error condition. 124 */ 125 void 126 handle_error(struct clfs **fsp, int n) 127 { 128 syslog(LOG_NOTICE, "%s: detaching cleaner", fsp[n]->lfs_fsmnt); 129 free(fsp[n]); 130 if (n != nfss - 1) 131 fsp[n] = fsp[nfss - 1]; 132 --nfss; 133 } 134 135 /* 136 * Reinitialize a filesystem if, e.g., its size changed. 137 */ 138 int 139 reinit_fs(struct clfs *fs) 140 { 141 char fsname[MNAMELEN]; 142 143 strncpy(fsname, (char *)fs->lfs_fsmnt, MNAMELEN); 144 close(fs->clfs_ifilefd); 145 close(fs->clfs_devfd); 146 fd_reclaim(fs->clfs_devvp); 147 fd_reclaim(fs->lfs_ivnode); 148 free(fs->clfs_dev); 149 free(fs->clfs_segtab); 150 free(fs->clfs_segtabp); 151 152 return init_fs(fs, fsname); 153 } 154 155 #ifdef REPAIR_ZERO_FINFO 156 /* 157 * Use fsck's lfs routines to load the Ifile from an unmounted fs. 158 * We interpret "fsname" as the name of the raw disk device. 159 */ 160 int 161 init_unmounted_fs(struct clfs *fs, char *fsname) 162 { 163 struct lfs *disc_fs; 164 int i; 165 166 fs->clfs_dev = fsname; 167 if ((fs->clfs_devfd = open(fs->clfs_dev, O_RDWR)) < 0) { 168 syslog(LOG_ERR, "couldn't open device %s read/write", 169 fs->clfs_dev); 170 return -1; 171 } 172 173 disc_fs = lfs_init(fs->clfs_devfd, 0, 0, 0, 0); 174 175 fs->lfs_dlfs = disc_fs->lfs_dlfs; /* Structure copy */ 176 strncpy(fs->lfs_fsmnt, fsname, MNAMELEN); 177 fs->lfs_ivnode = (struct uvnode *)disc_fs->lfs_ivnode; 178 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 179 atatime); 180 181 /* Allocate and clear segtab */ 182 fs->clfs_segtab = (struct clfs_seguse *)malloc(fs->lfs_nseg * 183 sizeof(*fs->clfs_segtab)); 184 fs->clfs_segtabp = (struct clfs_seguse **)malloc(fs->lfs_nseg * 185 sizeof(*fs->clfs_segtabp)); 186 for (i = 0; i < fs->lfs_nseg; i++) { 187 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 188 fs->clfs_segtab[i].flags = 0x0; 189 } 190 syslog(LOG_NOTICE, "%s: unmounted cleaner starting", fsname); 191 192 return 0; 193 } 194 #endif 195 196 /* 197 * Set up the file descriptors, including the Ifile descriptor. 198 * If we can't get the Ifile, this is not an LFS (or the kernel is 199 * too old to support the fcntl). 200 * XXX Merge this and init_unmounted_fs, switching on whether 201 * XXX "fsname" is a dir or a char special device. Should 202 * XXX also be able to read unmounted devices out of fstab, the way 203 * XXX fsck does. 204 */ 205 int 206 init_fs(struct clfs *fs, char *fsname) 207 { 208 struct statvfs sf; 209 int rootfd; 210 int i; 211 212 /* 213 * Get the raw device from the block device. 214 * XXX this is ugly. Is there a way to discover the raw device 215 * XXX for a given mount point? 216 */ 217 if (statvfs(fsname, &sf) < 0) 218 return -1; 219 fs->clfs_dev = malloc(strlen(sf.f_mntfromname) + 2); 220 if (fs->clfs_dev == NULL) { 221 syslog(LOG_ERR, "couldn't malloc device name string: %m"); 222 return -1; 223 } 224 sprintf(fs->clfs_dev, "/dev/r%s", sf.f_mntfromname + 5); 225 if ((fs->clfs_devfd = open(fs->clfs_dev, O_RDONLY)) < 0) { 226 syslog(LOG_ERR, "couldn't open device %s for reading", 227 fs->clfs_dev); 228 return -1; 229 } 230 231 /* Find the Ifile and open it */ 232 if ((rootfd = open(fsname, O_RDONLY)) < 0) 233 return -2; 234 if (fcntl(rootfd, LFCNIFILEFH, &fs->clfs_ifilefh) < 0) 235 return -3; 236 if ((fs->clfs_ifilefd = fhopen(&fs->clfs_ifilefh, O_RDONLY)) < 0) 237 return -4; 238 close(rootfd); 239 240 /* Load in the superblock */ 241 if (pread(fs->clfs_devfd, &(fs->lfs_dlfs), sizeof(struct dlfs), 242 LFS_LABELPAD) < 0) 243 return -1; 244 245 /* If this is not a version 2 filesystem, complain and exit */ 246 if (fs->lfs_version != 2) { 247 syslog(LOG_ERR, "%s: not a version 2 LFS", fsname); 248 return -1; 249 } 250 251 /* Assume fsname is the mounted name */ 252 strncpy((char *)fs->lfs_fsmnt, fsname, MNAMELEN); 253 254 /* Set up vnodes for Ifile and raw device */ 255 fs->lfs_ivnode = fd_vget(fs->clfs_ifilefd, fs->lfs_bsize, 0, 0); 256 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 257 atatime); 258 259 /* Allocate and clear segtab */ 260 fs->clfs_segtab = (struct clfs_seguse *)malloc(fs->lfs_nseg * 261 sizeof(*fs->clfs_segtab)); 262 fs->clfs_segtabp = (struct clfs_seguse **)malloc(fs->lfs_nseg * 263 sizeof(*fs->clfs_segtabp)); 264 if (fs->clfs_segtab == NULL || fs->clfs_segtabp == NULL) { 265 syslog(LOG_ERR, "%s: couldn't malloc segment table: %m", 266 fs->clfs_dev); 267 return -1; 268 } 269 270 for (i = 0; i < fs->lfs_nseg; i++) { 271 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 272 fs->clfs_segtab[i].flags = 0x0; 273 } 274 275 syslog(LOG_NOTICE, "%s: attaching cleaner", fsname); 276 return 0; 277 } 278 279 /* 280 * Invalidate all the currently held Ifile blocks so they will be 281 * reread when we clean. Check the size while we're at it, and 282 * resize the buffer cache if necessary. 283 */ 284 void 285 reload_ifile(struct clfs *fs) 286 { 287 struct ubuf *bp; 288 struct stat st; 289 int ohashmax; 290 extern int hashmax; 291 292 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_dirtyblkhd)) != NULL) { 293 bremfree(bp); 294 buf_destroy(bp); 295 } 296 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_cleanblkhd)) != NULL) { 297 bremfree(bp); 298 buf_destroy(bp); 299 } 300 301 /* If Ifile is larger than buffer cache, rehash */ 302 fstat(fs->clfs_ifilefd, &st); 303 if (st.st_size / fs->lfs_bsize > hashmax) { 304 ohashmax = hashmax; 305 bufrehash(st.st_size / fs->lfs_bsize); 306 dlog("%s: resized buffer hash from %d to %d", 307 fs->lfs_fsmnt, ohashmax, hashmax); 308 } 309 } 310 311 /* 312 * Get IFILE entry for the given inode, store in ifpp. The buffer 313 * which contains that data is returned in bpp, and must be brelse()d 314 * by the caller. 315 */ 316 void 317 lfs_ientry(IFILE **ifpp, struct clfs *fs, ino_t ino, struct ubuf **bpp) 318 { 319 int error; 320 321 error = bread(fs->lfs_ivnode, ino / fs->lfs_ifpb + fs->lfs_cleansz + 322 fs->lfs_segtabsz, fs->lfs_bsize, NOCRED, bpp); 323 *ifpp = (IFILE *)(*bpp)->b_data + ino % fs->lfs_ifpb; 324 return; 325 } 326 327 #ifdef TEST_PATTERN 328 /* 329 * Check ROOTINO for file data. The assumption is that we are running 330 * the "twofiles" test with the rest of the filesystem empty. Files 331 * created by "twofiles" match the test pattern, but ROOTINO and the 332 * executable itself (assumed to be inode 3) should not match. 333 */ 334 static void 335 check_test_pattern(BLOCK_INFO *bip) 336 { 337 int j; 338 unsigned char *cp = bip->bi_bp; 339 340 /* Check inode sanity */ 341 if (bip->bi_lbn == LFS_UNUSED_LBN) { 342 assert(((struct ufs1_dinode *)bip->bi_bp)->di_inumber == 343 bip->bi_inode); 344 } 345 346 /* These can have the test pattern and it's all good */ 347 if (bip->bi_inode > 3) 348 return; 349 350 for (j = 0; j < bip->bi_size; j++) { 351 if (cp[j] != (j & 0xff)) 352 break; 353 } 354 assert(j < bip->bi_size); 355 } 356 #endif /* TEST_PATTERN */ 357 358 /* 359 * Parse the partial segment at daddr, adding its information to 360 * bip. Return the address of the next partial segment to read. 361 */ 362 int32_t 363 parse_pseg(struct clfs *fs, daddr_t daddr, BLOCK_INFO **bipp, int *bic) 364 { 365 SEGSUM *ssp; 366 IFILE *ifp; 367 BLOCK_INFO *bip, *nbip; 368 int32_t *iaddrp, idaddr, odaddr; 369 FINFO *fip; 370 struct ubuf *ifbp; 371 struct ufs1_dinode *dip; 372 u_int32_t ck, vers; 373 int fic, inoc, obic; 374 int i; 375 char *cp; 376 377 odaddr = daddr; 378 obic = *bic; 379 bip = *bipp; 380 381 /* 382 * Retrieve the segment header, set up the SEGSUM pointer 383 * as well as the first FINFO and inode address pointer. 384 */ 385 cp = fd_ptrget(fs->clfs_devvp, daddr); 386 ssp = (SEGSUM *)cp; 387 iaddrp = ((int32_t *)(cp + fs->lfs_ibsize)) - 1; 388 fip = (FINFO *)(cp + sizeof(SEGSUM)); 389 390 /* 391 * Check segment header magic and checksum 392 */ 393 if (ssp->ss_magic != SS_MAGIC) { 394 syslog(LOG_WARNING, "%s: sumsum magic number bad at 0x%x:" 395 " read 0x%x, expected 0x%x", fs->lfs_fsmnt, 396 (int32_t)daddr, ssp->ss_magic, SS_MAGIC); 397 return 0x0; 398 } 399 ck = cksum(&ssp->ss_datasum, fs->lfs_sumsize - sizeof(ssp->ss_sumsum)); 400 if (ck != ssp->ss_sumsum) { 401 syslog(LOG_WARNING, "%s: sumsum checksum mismatch at 0x%x:" 402 " read 0x%x, computed 0x%x", fs->lfs_fsmnt, 403 (int32_t)daddr, ssp->ss_sumsum, ck); 404 return 0x0; 405 } 406 407 /* Initialize data sum */ 408 ck = 0; 409 410 /* Point daddr at next block after segment summary */ 411 ++daddr; 412 413 /* 414 * Loop over file info and inode pointers. We always move daddr 415 * forward here because we are also computing the data checksum 416 * as we go. 417 */ 418 fic = inoc = 0; 419 while (fic < ssp->ss_nfinfo || inoc < ssp->ss_ninos) { 420 /* 421 * We must have either a file block or an inode block. 422 * If we don't have either one, it's an error. 423 */ 424 if (fic >= ssp->ss_nfinfo && *iaddrp != daddr) { 425 syslog(LOG_WARNING, "%s: bad pseg at %x (seg %d)", 426 fs->lfs_fsmnt, odaddr, dtosn(fs, odaddr)); 427 *bipp = bip; 428 return 0x0; 429 } 430 431 /* 432 * Note each inode from the inode blocks 433 */ 434 if (inoc < ssp->ss_ninos && *iaddrp == daddr) { 435 cp = fd_ptrget(fs->clfs_devvp, daddr); 436 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 437 dip = (struct ufs1_dinode *)cp; 438 for (i = 0; i < fs->lfs_inopb; i++) { 439 if (dip[i].di_inumber == 0) 440 break; 441 442 /* 443 * Check currency before adding it 444 */ 445 #ifndef REPAIR_ZERO_FINFO 446 lfs_ientry(&ifp, fs, dip[i].di_inumber, &ifbp); 447 idaddr = ifp->if_daddr; 448 brelse(ifbp); 449 if (idaddr != daddr) 450 #endif 451 continue; 452 453 /* 454 * A current inode. Add it. 455 */ 456 ++*bic; 457 nbip = (BLOCK_INFO *)realloc(bip, *bic * 458 sizeof(*bip)); 459 if (nbip) 460 bip = nbip; 461 else { 462 --*bic; 463 *bipp = bip; 464 return 0x0; 465 } 466 bip[*bic - 1].bi_inode = dip[i].di_inumber; 467 bip[*bic - 1].bi_lbn = LFS_UNUSED_LBN; 468 bip[*bic - 1].bi_daddr = daddr; 469 bip[*bic - 1].bi_segcreate = ssp->ss_create; 470 bip[*bic - 1].bi_version = dip[i].di_gen; 471 bip[*bic - 1].bi_bp = &(dip[i]); 472 bip[*bic - 1].bi_size = DINODE1_SIZE; 473 } 474 inoc += i; 475 daddr += btofsb(fs, fs->lfs_ibsize); 476 --iaddrp; 477 continue; 478 } 479 480 /* 481 * Note each file block from the finfo blocks 482 */ 483 if (fic >= ssp->ss_nfinfo) 484 continue; 485 486 /* Count this finfo, whether or not we use it */ 487 ++fic; 488 489 /* 490 * If this finfo has nblocks==0, it was written wrong. 491 * Kernels with this problem always wrote this zero-sized 492 * finfo last, so just ignore it. 493 */ 494 if (fip->fi_nblocks == 0) { 495 #ifdef REPAIR_ZERO_FINFO 496 struct ubuf *nbp; 497 SEGSUM *nssp; 498 499 syslog(LOG_WARNING, "fixing short FINFO at %x (seg %d)", 500 odaddr, dtosn(fs, odaddr)); 501 bread(fs->clfs_devvp, odaddr, fs->lfs_fsize, NOCRED, &nbp); 502 nssp = (SEGSUM *)nbp->b_data; 503 --nssp->ss_nfinfo; 504 nssp->ss_sumsum = cksum(&nssp->ss_datasum, 505 fs->lfs_sumsize - sizeof(nssp->ss_sumsum)); 506 bwrite(nbp); 507 #endif 508 continue; 509 } 510 511 /* 512 * Check currency before adding blocks 513 */ 514 #ifdef REPAIR_ZERO_FINFO 515 vers = -1; 516 #else 517 lfs_ientry(&ifp, fs, fip->fi_ino, &ifbp); 518 vers = ifp->if_version; 519 brelse(ifbp); 520 #endif 521 if (vers != fip->fi_version) { 522 size_t size; 523 524 /* Read all the blocks from the data summary */ 525 for (i = 0; i < fip->fi_nblocks; i++) { 526 size = (i == fip->fi_nblocks - 1) ? 527 fip->fi_lastlength : fs->lfs_bsize; 528 cp = fd_ptrget(fs->clfs_devvp, daddr); 529 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 530 daddr += btofsb(fs, size); 531 } 532 fip = (FINFO *)(fip->fi_blocks + fip->fi_nblocks); 533 continue; 534 } 535 536 /* Add all the blocks from the finfos (current or not) */ 537 nbip = (BLOCK_INFO *)realloc(bip, (*bic + fip->fi_nblocks) * 538 sizeof(*bip)); 539 if (nbip) 540 bip = nbip; 541 else { 542 *bipp = bip; 543 return 0x0; 544 } 545 546 for (i = 0; i < fip->fi_nblocks; i++) { 547 bip[*bic + i].bi_inode = fip->fi_ino; 548 bip[*bic + i].bi_lbn = fip->fi_blocks[i]; 549 bip[*bic + i].bi_daddr = daddr; 550 bip[*bic + i].bi_segcreate = ssp->ss_create; 551 bip[*bic + i].bi_version = fip->fi_version; 552 bip[*bic + i].bi_size = (i == fip->fi_nblocks - 1) ? 553 fip->fi_lastlength : fs->lfs_bsize; 554 cp = fd_ptrget(fs->clfs_devvp, daddr); 555 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 556 bip[*bic + i].bi_bp = cp; 557 daddr += btofsb(fs, bip[*bic + i].bi_size); 558 559 #ifdef TEST_PATTERN 560 check_test_pattern(bip + *bic + i); /* XXXDEBUG */ 561 #endif 562 } 563 *bic += fip->fi_nblocks; 564 fip = (FINFO *)(fip->fi_blocks + fip->fi_nblocks); 565 } 566 567 #ifndef REPAIR_ZERO_FINFO 568 if (ssp->ss_datasum != ck) { 569 syslog(LOG_WARNING, "%s: data checksum bad at 0x%x:" 570 " read 0x%x, computed 0x%x", fs->lfs_fsmnt, odaddr, 571 ssp->ss_datasum, ck); 572 *bic = obic; 573 return 0x0; 574 } 575 #endif 576 577 *bipp = bip; 578 return daddr; 579 } 580 581 static void 582 log_segment_read(struct clfs *fs, int sn) 583 { 584 FILE *fp; 585 char *cp; 586 587 /* 588 * Write the segment read, and its contents, into a log file in 589 * the current directory. We don't need to log the location of 590 * the segment, since that can be inferred from the segments up 591 * to this point (ss_nextseg field of the previously written segment). 592 * 593 * We can use this info later to reconstruct the filesystem at any 594 * given point in time for analysis, by replaying the log forward 595 * indexed by the segment serial numbers; but it is not suitable 596 * for everyday use since the copylog will be simply enormous. 597 */ 598 cp = fd_ptrget(fs->clfs_devvp, sntod(fs, sn)); 599 600 fp = fopen(copylog_filename, "ab"); 601 if (fp != NULL) { 602 if (fwrite(cp, (size_t)fs->lfs_ssize, 1, fp) < 0) { 603 perror("writing segment to copy log"); 604 } 605 } 606 fclose(fp); 607 } 608 609 /* 610 * Read a segment to populate the BLOCK_INFO structures. 611 * Return the number of partial segments read and parsed. 612 */ 613 int 614 load_segment(struct clfs *fs, int sn, BLOCK_INFO **bipp, int *bic) 615 { 616 int32_t daddr; 617 int i, npseg; 618 619 daddr = sntod(fs, sn); 620 if (daddr < btofsb(fs, LFS_LABELPAD)) 621 daddr = btofsb(fs, LFS_LABELPAD); 622 for (i = 0; i < LFS_MAXNUMSB; i++) { 623 if (fs->lfs_sboffs[i] == daddr) { 624 daddr += btofsb(fs, LFS_SBPAD); 625 break; 626 } 627 } 628 629 /* Preload the segment buffer */ 630 if (fd_preload(fs->clfs_devvp, sntod(fs, sn)) < 0) 631 return -1; 632 633 if (copylog_filename) 634 log_segment_read(fs, sn); 635 636 /* Note bytes read for stats */ 637 cleaner_stats.segs_cleaned++; 638 cleaner_stats.bytes_read += fs->lfs_ssize; 639 ++fs->clfs_nactive; 640 641 npseg = 0; 642 while(dtosn(fs, daddr) == sn && 643 dtosn(fs, daddr + btofsb(fs, fs->lfs_bsize)) == sn) { 644 daddr = parse_pseg(fs, daddr, bipp, bic); 645 if (daddr == 0x0) { 646 ++cleaner_stats.segs_error; 647 break; 648 } 649 ++npseg; 650 } 651 652 return npseg; 653 } 654 655 void 656 calc_cb(struct clfs *fs, int sn, struct clfs_seguse *t) 657 { 658 time_t now; 659 int64_t age, benefit, cost; 660 661 time(&now); 662 age = (now < t->lastmod ? 0 : now - t->lastmod); 663 664 /* Under no circumstances clean active or already-clean segments */ 665 if ((t->flags & SEGUSE_ACTIVE) || !(t->flags & SEGUSE_DIRTY)) { 666 t->priority = 0; 667 return; 668 } 669 670 /* 671 * If the segment is empty, there is no reason to clean it. 672 * Clear its error condition, if any, since we are never going to 673 * try to parse this one. 674 */ 675 if (t->nbytes == 0) { 676 t->flags &= ~SEGUSE_ERROR; /* Strip error once empty */ 677 t->priority = 0; 678 return; 679 } 680 681 if (t->flags & SEGUSE_ERROR) { /* No good if not already empty */ 682 /* No benefit */ 683 t->priority = 0; 684 return; 685 } 686 687 if (t->nbytes < 0 || t->nbytes > fs->lfs_ssize) { 688 /* Another type of error */ 689 syslog(LOG_WARNING, "segment %d: bad seguse count %d", 690 sn, t->nbytes); 691 t->flags |= SEGUSE_ERROR; 692 t->priority = 0; 693 return; 694 } 695 696 /* 697 * The non-degenerate case. Use Rosenblum's cost-benefit algorithm. 698 * Calculate the benefit from cleaning this segment (one segment, 699 * minus fragmentation, dirty blocks and a segment summary block) 700 * and weigh that against the cost (bytes read plus bytes written). 701 * We count the summary headers as "dirty" to avoid cleaning very 702 * old and very full segments. 703 */ 704 benefit = (int64_t)fs->lfs_ssize - t->nbytes - 705 (t->nsums + 1) * fs->lfs_fsize; 706 if (fs->lfs_bsize > fs->lfs_fsize) /* fragmentation */ 707 benefit -= (fs->lfs_bsize / 2); 708 if (benefit <= 0) { 709 t->priority = 0; 710 return; 711 } 712 713 cost = fs->lfs_ssize + t->nbytes; 714 t->priority = (256 * benefit * age) / cost; 715 716 return; 717 } 718 719 /* 720 * Comparator for BLOCK_INFO structures. Anything not in one of the segments 721 * we're looking at sorts higher; after that we sort first by inode number 722 * and then by block number (unsigned, i.e., negative sorts higher) *but* 723 * sort inodes before data blocks. 724 */ 725 static int 726 bi_comparator(const void *va, const void *vb) 727 { 728 BLOCK_INFO *a, *b; 729 730 a = (BLOCK_INFO *)va; 731 b = (BLOCK_INFO *)vb; 732 733 /* Check for out-of-place block */ 734 if (a->bi_segcreate == a->bi_daddr && 735 b->bi_segcreate != b->bi_daddr) 736 return -1; 737 if (a->bi_segcreate != a->bi_daddr && 738 b->bi_segcreate == b->bi_daddr) 739 return 1; 740 if (a->bi_size <= 0 && b->bi_size > 0) 741 return 1; 742 if (b->bi_size <= 0 && a->bi_size > 0) 743 return -1; 744 745 /* Check inode number */ 746 if (a->bi_inode != b->bi_inode) 747 return a->bi_inode - b->bi_inode; 748 749 /* Check lbn */ 750 if (a->bi_lbn == LFS_UNUSED_LBN) /* Inodes sort lower than blocks */ 751 return -1; 752 if (b->bi_lbn == LFS_UNUSED_LBN) 753 return 1; 754 if ((u_int32_t)a->bi_lbn > (u_int32_t)b->bi_lbn) 755 return 1; 756 else 757 return -1; 758 } 759 760 /* 761 * Comparator for sort_segments: cost-benefit equation. 762 */ 763 static int 764 cb_comparator(const void *va, const void *vb) 765 { 766 struct clfs_seguse *a, *b; 767 768 a = *(struct clfs_seguse **)va; 769 b = *(struct clfs_seguse **)vb; 770 return a->priority > b->priority ? -1 : 1; 771 } 772 773 void 774 toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, int *bic, int *sizep) 775 { 776 int i, r; 777 BLOCK_INFO *bip = *bipp; 778 struct lfs_fcntl_markv /* { 779 BLOCK_INFO *blkiov; 780 int blkcnt; 781 } */ lim; 782 783 if (bic == 0 || bip == NULL) 784 return; 785 786 /* 787 * Kludge: Store the disk address in segcreate so we know which 788 * ones to toss. 789 */ 790 for (i = 0; i < *bic; i++) 791 bip[i].bi_segcreate = bip[i].bi_daddr; 792 793 /* Sort the blocks */ 794 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 795 796 /* Use bmapv to locate the blocks */ 797 lim.blkiov = bip; 798 lim.blkcnt = *bic; 799 if ((r = fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim)) < 0) { 800 syslog(LOG_WARNING, "%s: bmapv returned %d (%m)", 801 fs->lfs_fsmnt, r); 802 return; 803 } 804 805 /* Toss blocks not in this segment */ 806 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 807 808 /* Get rid of stale blocks */ 809 if (sizep) 810 *sizep = 0; 811 for (i = 0; i < *bic; i++) { 812 if (bip[i].bi_segcreate != bip[i].bi_daddr) 813 break; 814 if (sizep) 815 *sizep += bip[i].bi_size; 816 } 817 *bic = i; /* XXX realloc bip? */ 818 *bipp = bip; 819 820 return; 821 } 822 823 /* 824 * Clean a segment and mark it invalid. 825 */ 826 int 827 invalidate_segment(struct clfs *fs, int sn) 828 { 829 BLOCK_INFO *bip; 830 int i, r, bic; 831 off_t nb; 832 double util; 833 struct lfs_fcntl_markv /* { 834 BLOCK_INFO *blkiov; 835 int blkcnt; 836 } */ lim; 837 838 dlog("%s: inval seg %d", fs->lfs_fsmnt, sn); 839 840 bip = NULL; 841 bic = 0; 842 fs->clfs_nactive = 0; 843 if (load_segment(fs, sn, &bip, &bic) <= 0) 844 return -1; 845 toss_old_blocks(fs, &bip, &bic, NULL); 846 847 /* Record statistics */ 848 for (i = nb = 0; i < bic; i++) 849 nb += bip[i].bi_size; 850 util = ((double)nb) / (fs->clfs_nactive * fs->lfs_ssize); 851 cleaner_stats.util_tot += util; 852 cleaner_stats.util_sos += util * util; 853 cleaner_stats.bytes_written += nb; 854 855 /* 856 * Use markv to move the blocks. 857 */ 858 lim.blkiov = bip; 859 lim.blkcnt = bic; 860 if ((r = fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) { 861 syslog(LOG_WARNING, "%s: markv returned %d (%m) " 862 "for seg %d", fs->lfs_fsmnt, r, sn); 863 return r; 864 } 865 866 /* 867 * Finally call invalidate to invalidate the segment. 868 */ 869 if ((r = fcntl(fs->clfs_ifilefd, LFCNINVAL, &sn)) < 0) { 870 syslog(LOG_WARNING, "%s: inval returned %d (%m) " 871 "for seg %d", fs->lfs_fsmnt, r, sn); 872 return r; 873 } 874 875 return 0; 876 } 877 878 /* 879 * Check to see if the given ino/lbn pair is represented in the BLOCK_INFO 880 * array we are sending to the kernel, or if the kernel will have to add it. 881 * The kernel will only add each such pair once, though, so keep track of 882 * previous requests in a separate "extra" BLOCK_INFO array. Returns 1 883 * if the block needs to be added, 0 if it is already represented. 884 */ 885 static int 886 check_or_add(ino_t ino, int32_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp) 887 { 888 BLOCK_INFO *t, *ebip = *ebipp; 889 int ebic = *ebicp; 890 int k; 891 892 for (k = 0; k < bic; k++) { 893 if (bip[k].bi_inode != ino) 894 break; 895 if (bip[k].bi_lbn == lbn) { 896 return 0; 897 } 898 } 899 900 /* Look on the list of extra blocks, too */ 901 for (k = 0; k < ebic; k++) { 902 if (ebip[k].bi_inode == ino && ebip[k].bi_lbn == lbn) { 903 return 0; 904 } 905 } 906 907 ++ebic; 908 t = realloc(ebip, ebic * sizeof(BLOCK_INFO)); 909 if (t == NULL) 910 return 1; /* Note *ebipc is not updated */ 911 912 ebip = t; 913 ebip[ebic - 1].bi_inode = ino; 914 ebip[ebic - 1].bi_lbn = lbn; 915 916 *ebipp = ebip; 917 *ebicp = ebic; 918 return 1; 919 } 920 921 /* 922 * Look for indirect blocks we will have to write which are not 923 * contained in this collection of blocks. This constitutes 924 * a hidden cleaning cost, since we are unaware of it until we 925 * have already read the segments. Return the total cost, and fill 926 * in *ifc with the part of that cost due to rewriting the Ifile. 927 */ 928 static off_t 929 check_hidden_cost(struct clfs *fs, BLOCK_INFO *bip, int bic, off_t *ifc) 930 { 931 int start; 932 struct indir in[NIADDR + 1]; 933 int num; 934 int i, j, ebic; 935 BLOCK_INFO *ebip; 936 int32_t lbn; 937 938 start = 0; 939 ebip = NULL; 940 ebic = 0; 941 for (i = 0; i < bic; i++) { 942 if (i == 0 || bip[i].bi_inode != bip[start].bi_inode) { 943 start = i; 944 /* 945 * Look for IFILE blocks, unless this is the Ifile. 946 */ 947 if (bip[i].bi_inode != fs->lfs_ifile) { 948 lbn = fs->lfs_cleansz + bip[i].bi_inode / 949 fs->lfs_ifpb; 950 *ifc += check_or_add(fs->lfs_ifile, lbn, 951 bip, bic, &ebip, &ebic); 952 } 953 } 954 if (bip[i].bi_lbn == LFS_UNUSED_LBN) 955 continue; 956 if (bip[i].bi_lbn < NDADDR) 957 continue; 958 959 ufs_getlbns((struct lfs *)fs, NULL, (daddr_t)bip[i].bi_lbn, in, &num); 960 for (j = 0; j < num; j++) { 961 check_or_add(bip[i].bi_inode, in[j].in_lbn, 962 bip + start, bic - start, &ebip, &ebic); 963 } 964 } 965 return ebic; 966 } 967 968 /* 969 * Select segments to clean, add blocks from these segments to a cleaning 970 * list, and send this list through lfs_markv() to move them to new 971 * locations on disk. 972 */ 973 int 974 clean_fs(struct clfs *fs, CLEANERINFO *cip) 975 { 976 int i, j, ngood, sn, bic, r, npos; 977 int bytes, totbytes; 978 struct ubuf *bp; 979 SEGUSE *sup; 980 static BLOCK_INFO *bip; 981 struct lfs_fcntl_markv /* { 982 BLOCK_INFO *blkiov; 983 int blkcnt; 984 } */ lim; 985 int mc; 986 BLOCK_INFO *mbip; 987 int inc; 988 off_t nb; 989 off_t goal; 990 off_t extra, if_extra; 991 double util; 992 993 /* Read the segment table into our private structure */ 994 npos = 0; 995 for (i = 0; i < fs->lfs_nseg; i+= fs->lfs_sepb) { 996 bread(fs->lfs_ivnode, fs->lfs_cleansz + i / fs->lfs_sepb, 997 fs->lfs_bsize, NOCRED, &bp); 998 for (j = 0; j < fs->lfs_sepb && i + j < fs->lfs_nseg; j++) { 999 sup = ((SEGUSE *)bp->b_data) + j; 1000 fs->clfs_segtab[i + j].nbytes = sup->su_nbytes; 1001 fs->clfs_segtab[i + j].nsums = sup->su_nsums; 1002 fs->clfs_segtab[i + j].lastmod = sup->su_lastmod; 1003 /* Keep error status but renew other flags */ 1004 fs->clfs_segtab[i + j].flags &= SEGUSE_ERROR; 1005 fs->clfs_segtab[i + j].flags |= sup->su_flags; 1006 1007 /* Compute cost-benefit coefficient */ 1008 calc_cb(fs, i + j, fs->clfs_segtab + i + j); 1009 if (fs->clfs_segtab[i + j].priority > 0) 1010 ++npos; 1011 } 1012 brelse(bp); 1013 } 1014 1015 /* Sort segments based on cleanliness, fulness, and condition */ 1016 heapsort(fs->clfs_segtabp, fs->lfs_nseg, sizeof(struct clfs_seguse *), 1017 cb_comparator); 1018 1019 /* If no segment is cleanable, just return */ 1020 if (fs->clfs_segtabp[0]->priority == 0) { 1021 dlog("%s: no segment cleanable", fs->lfs_fsmnt); 1022 return 0; 1023 } 1024 1025 /* Load some segments' blocks into bip */ 1026 bic = 0; 1027 fs->clfs_nactive = 0; 1028 ngood = 0; 1029 if (use_bytes) { 1030 /* Set attainable goal */ 1031 goal = fs->lfs_ssize * atatime; 1032 if (goal > (cip->clean - 1) * fs->lfs_ssize / 2) 1033 goal = MAX((cip->clean - 1) * fs->lfs_ssize, 1034 fs->lfs_ssize) / 2; 1035 1036 dlog("%s: cleaning with goal %" PRId64 1037 " bytes (%d segs clean, %d cleanable)", 1038 fs->lfs_fsmnt, goal, cip->clean, npos); 1039 syslog(LOG_INFO, "%s: cleaning with goal %" PRId64 1040 " bytes (%d segs clean, %d cleanable)", 1041 fs->lfs_fsmnt, goal, cip->clean, npos); 1042 totbytes = 0; 1043 for (i = 0; i < fs->lfs_nseg && totbytes < goal; i++) { 1044 if (fs->clfs_segtabp[i]->priority == 0) 1045 break; 1046 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1047 dlog("%s: add seg %d prio %" PRIu64 1048 " containing %ld bytes", 1049 fs->lfs_fsmnt, sn, fs->clfs_segtabp[i]->priority, 1050 fs->clfs_segtabp[i]->nbytes); 1051 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) { 1052 ++ngood; 1053 toss_old_blocks(fs, &bip, &bic, &bytes); 1054 totbytes += bytes; 1055 } else if (r == 0) 1056 fd_release(fs->clfs_devvp); 1057 else 1058 break; 1059 } 1060 } else { 1061 /* Set attainable goal */ 1062 goal = atatime; 1063 if (goal > cip->clean - 1) 1064 goal = MAX(cip->clean - 1, 1); 1065 1066 dlog("%s: cleaning with goal %d segments (%d clean, %d cleanable)", 1067 fs->lfs_fsmnt, (int)goal, cip->clean, npos); 1068 for (i = 0; i < fs->lfs_nseg && ngood < goal; i++) { 1069 if (fs->clfs_segtabp[i]->priority == 0) 1070 break; 1071 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1072 dlog("%s: add seg %d prio %" PRIu64, 1073 fs->lfs_fsmnt, sn, fs->clfs_segtabp[i]->priority); 1074 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) 1075 ++ngood; 1076 else if (r == 0) 1077 fd_release(fs->clfs_devvp); 1078 else 1079 break; 1080 } 1081 toss_old_blocks(fs, &bip, &bic, NULL); 1082 } 1083 1084 /* If there is nothing to do, try again later. */ 1085 if (bic == 0) { 1086 dlog("%s: no blocks to clean in %d cleanable segments", 1087 fs->lfs_fsmnt, (int)ngood); 1088 fd_release_all(fs->clfs_devvp); 1089 return 0; 1090 } 1091 1092 /* Record statistics */ 1093 for (i = nb = 0; i < bic; i++) 1094 nb += bip[i].bi_size; 1095 util = ((double)nb) / (fs->clfs_nactive * fs->lfs_ssize); 1096 cleaner_stats.util_tot += util; 1097 cleaner_stats.util_sos += util * util; 1098 cleaner_stats.bytes_written += nb; 1099 1100 /* 1101 * Check out our blocks to see if there are hidden cleaning costs. 1102 * If there are, we might be cleaning ourselves deeper into a hole 1103 * rather than doing anything useful. 1104 * XXX do something about this. 1105 */ 1106 if_extra = 0; 1107 extra = fs->lfs_bsize * (off_t)check_hidden_cost(fs, bip, bic, &if_extra); 1108 if_extra *= fs->lfs_bsize; 1109 1110 /* 1111 * Use markv to move the blocks. 1112 */ 1113 if (do_small) 1114 inc = MAXPHYS / fs->lfs_bsize - 1; 1115 else 1116 inc = LFS_MARKV_MAXBLKCNT / 2; 1117 for (mc = 0, mbip = bip; mc < bic; mc += inc, mbip += inc) { 1118 lim.blkiov = mbip; 1119 lim.blkcnt = (bic - mc > inc ? inc : bic - mc); 1120 #ifdef TEST_PATTERN 1121 dlog("checking blocks %d-%d", mc, mc + lim.blkcnt - 1); 1122 for (i = 0; i < lim.blkcnt; i++) { 1123 check_test_pattern(mbip + i); 1124 } 1125 #endif /* TEST_PATTERN */ 1126 dlog("sending blocks %d-%d", mc, mc + lim.blkcnt - 1); 1127 if ((r = fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) { 1128 syslog(LOG_WARNING, "%s: markv returned %d (%m)", 1129 fs->lfs_fsmnt, r); 1130 if (errno != EAGAIN && errno != ESHUTDOWN) { 1131 fd_release_all(fs->clfs_devvp); 1132 return r; 1133 } 1134 } 1135 } 1136 1137 /* 1138 * Report progress (or lack thereof) 1139 */ 1140 syslog(LOG_INFO, "%s: wrote %" PRId64 " dirty + %" 1141 PRId64 " supporting indirect + %" 1142 PRId64 " supporting Ifile = %" 1143 PRId64 " bytes to clean %d segs (%" PRId64 "%% recovery)", 1144 fs->lfs_fsmnt, (int64_t)nb, (int64_t)(extra - if_extra), 1145 (int64_t)if_extra, (int64_t)(nb + extra), ngood, 1146 (ngood ? (int64_t)(100 - (100 * (nb + extra)) / 1147 (ngood * fs->lfs_ssize)) : 1148 (int64_t)0)); 1149 if (nb + extra >= ngood * fs->lfs_ssize) 1150 syslog(LOG_WARNING, "%s: cleaner not making forward progress", 1151 fs->lfs_fsmnt); 1152 1153 /* 1154 * Finally call reclaim to prompt cleaning of the segments. 1155 */ 1156 fcntl(fs->clfs_ifilefd, LFCNRECLAIM, NULL); 1157 1158 fd_release_all(fs->clfs_devvp); 1159 return 0; 1160 } 1161 1162 /* 1163 * Read the cleanerinfo block and apply cleaning policy to determine whether 1164 * the given filesystem needs to be cleaned. Returns 1 if it does, 0 if it 1165 * does not, or -1 on error. 1166 */ 1167 int 1168 needs_cleaning(struct clfs *fs, CLEANERINFO *cip) 1169 { 1170 struct ubuf *bp; 1171 struct stat st; 1172 daddr_t fsb_per_seg, max_free_segs; 1173 time_t now; 1174 double loadavg; 1175 1176 /* If this fs is "on hold", don't clean it. */ 1177 if (fs->clfs_onhold) 1178 return 0; 1179 1180 /* 1181 * Read the cleanerinfo block from the Ifile. We don't want 1182 * the cached information, so invalidate the buffer before 1183 * handing it back. 1184 */ 1185 if (bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp)) { 1186 syslog(LOG_ERR, "%s: can't read inode", fs->lfs_fsmnt); 1187 return -1; 1188 } 1189 *cip = *(CLEANERINFO *)bp->b_data; /* Structure copy */ 1190 bp->b_flags |= B_INVAL; 1191 brelse(bp); 1192 cleaner_stats.bytes_read += fs->lfs_bsize; 1193 1194 /* 1195 * If the number of segments changed under us, reinit. 1196 * We don't have to start over from scratch, however, 1197 * since we don't hold any buffers. 1198 */ 1199 if (fs->lfs_nseg != cip->clean + cip->dirty) { 1200 if (reinit_fs(fs) < 0) { 1201 /* The normal case for unmount */ 1202 syslog(LOG_NOTICE, "%s: filesystem unmounted", fs->lfs_fsmnt); 1203 return -1; 1204 } 1205 syslog(LOG_NOTICE, "%s: nsegs changed", fs->lfs_fsmnt); 1206 } 1207 1208 /* Compute theoretical "free segments" maximum based on usage */ 1209 fsb_per_seg = segtod(fs, 1); 1210 max_free_segs = MAX(cip->bfree, 0) / fsb_per_seg + fs->lfs_minfreeseg; 1211 1212 dlog("%s: bfree = %d, avail = %d, clean = %d/%d", 1213 fs->lfs_fsmnt, cip->bfree, cip->avail, cip->clean, fs->lfs_nseg); 1214 1215 /* If the writer is waiting on us, clean it */ 1216 if (cip->clean <= fs->lfs_minfreeseg) 1217 return 1; 1218 1219 /* If there are enough segments, don't clean it */ 1220 if (cip->bfree - cip->avail <= fsb_per_seg && 1221 cip->avail > fsb_per_seg) 1222 return 0; 1223 1224 /* If we are in dire straits, clean it */ 1225 if (cip->bfree - cip->avail > fsb_per_seg && 1226 cip->avail <= fsb_per_seg) 1227 return 1; 1228 1229 /* If under busy threshold, clean regardless of load */ 1230 if (cip->clean < max_free_segs * BUSY_LIM) 1231 return 1; 1232 1233 /* Check busy status; clean if idle and under idle limit */ 1234 if (use_fs_idle) { 1235 /* Filesystem idle */ 1236 time(&now); 1237 if (fstat(fs->clfs_ifilefd, &st) < 0) { 1238 syslog(LOG_ERR, "%s: failed to stat ifile", 1239 fs->lfs_fsmnt); 1240 return -1; 1241 } 1242 if (now - st.st_mtime > segwait_timeout && 1243 cip->clean < max_free_segs * IDLE_LIM) 1244 return 1; 1245 } else { 1246 /* CPU idle - use one-minute load avg */ 1247 if (getloadavg(&loadavg, 1) == -1) { 1248 syslog(LOG_ERR, "%s: failed to get load avg", 1249 fs->lfs_fsmnt); 1250 return -1; 1251 } 1252 if (loadavg < load_threshold && 1253 cip->clean < max_free_segs * IDLE_LIM) 1254 return 1; 1255 } 1256 1257 return 0; 1258 } 1259 1260 /* 1261 * Report statistics. If the signal was SIGUSR2, clear the statistics too. 1262 * If the signal was SIGINT, exit. 1263 */ 1264 static void 1265 sig_report(int sig) 1266 { 1267 double avg = 0.0, stddev; 1268 1269 avg = cleaner_stats.util_tot / MAX(cleaner_stats.segs_cleaned, 1.0); 1270 stddev = cleaner_stats.util_sos / MAX(cleaner_stats.segs_cleaned - 1271 avg * avg, 1.0); 1272 syslog(LOG_INFO, "bytes read: %" PRId64, cleaner_stats.bytes_read); 1273 syslog(LOG_INFO, "bytes written: %" PRId64, cleaner_stats.bytes_written); 1274 syslog(LOG_INFO, "segments cleaned: %" PRId64, cleaner_stats.segs_cleaned); 1275 #if 0 1276 /* "Empty segments" is meaningless, since the kernel handles those */ 1277 syslog(LOG_INFO, "empty segments: %" PRId64, cleaner_stats.segs_empty); 1278 #endif 1279 syslog(LOG_INFO, "error segments: %" PRId64, cleaner_stats.segs_error); 1280 syslog(LOG_INFO, "utilization total: %g", cleaner_stats.util_tot); 1281 syslog(LOG_INFO, "utilization sos: %g", cleaner_stats.util_sos); 1282 syslog(LOG_INFO, "utilization avg: %4.2f", avg); 1283 syslog(LOG_INFO, "utilization sdev: %9.6f", stddev); 1284 1285 if (debug) 1286 bufstats(); 1287 1288 if (sig == SIGUSR2) 1289 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1290 if (sig == SIGINT) 1291 exit(0); 1292 } 1293 1294 static void 1295 sig_exit(int sig) 1296 { 1297 exit(0); 1298 } 1299 1300 static void 1301 usage(void) 1302 { 1303 errx(1, "usage: lfs_cleanerd [-bcdfmqs] [-i segnum] [-l load] " 1304 "[-n nsegs] [-r report_freq] [-t timeout] fs_name ..."); 1305 } 1306 1307 /* 1308 * Main. 1309 */ 1310 int 1311 main(int argc, char **argv) 1312 { 1313 int i, opt, error, r, loopcount; 1314 struct timeval tv; 1315 CLEANERINFO ci; 1316 #ifndef USE_CLIENT_SERVER 1317 char *cp, *pidname; 1318 #endif 1319 1320 /* 1321 * Set up defaults 1322 */ 1323 atatime = 1; 1324 segwait_timeout = 300; /* Five minutes */ 1325 load_threshold = 0.2; 1326 stat_report = 0; 1327 inval_segment = -1; 1328 copylog_filename = NULL; 1329 1330 /* 1331 * Parse command-line arguments 1332 */ 1333 while ((opt = getopt(argc, argv, "bC:cdfi:l:mn:qr:st:")) != -1) { 1334 switch (opt) { 1335 case 'b': /* Use bytes written, not segments read */ 1336 use_bytes = 1; 1337 break; 1338 case 'C': /* copy log */ 1339 copylog_filename = optarg; 1340 break; 1341 case 'c': /* Coalesce files */ 1342 do_coalesce++; 1343 break; 1344 case 'd': /* Debug mode. */ 1345 debug++; 1346 break; 1347 case 'f': /* Use fs idle time rather than cpu idle */ 1348 use_fs_idle = 1; 1349 break; 1350 case 'i': /* Invalidate this segment */ 1351 inval_segment = atoi(optarg); 1352 break; 1353 case 'l': /* Load below which to clean */ 1354 load_threshold = atof(optarg); 1355 break; 1356 case 'm': /* [compat only] */ 1357 break; 1358 case 'n': /* How many segs to clean at once */ 1359 atatime = atoi(optarg); 1360 break; 1361 case 'q': /* Quit after one run */ 1362 do_quit = 1; 1363 break; 1364 case 'r': /* Report every stat_report segments */ 1365 stat_report = atoi(optarg); 1366 break; 1367 case 's': /* Small writes */ 1368 do_small = 1; 1369 break; 1370 case 't': /* timeout */ 1371 segwait_timeout = atoi(optarg); 1372 break; 1373 default: 1374 usage(); 1375 /* NOTREACHED */ 1376 } 1377 } 1378 argc -= optind; 1379 argv += optind; 1380 1381 if (argc < 1) 1382 usage(); 1383 if (inval_segment >= 0 && argc != 1) { 1384 errx(1, "lfs_cleanerd: may only specify one filesystem when " 1385 "using -i flag"); 1386 } 1387 1388 /* 1389 * Set up daemon mode or verbose debug mode 1390 */ 1391 if (debug) { 1392 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID | LOG_PERROR, 1393 LOG_DAEMON); 1394 signal(SIGINT, sig_report); 1395 } else { 1396 if (daemon(0, 0) == -1) 1397 err(1, "lfs_cleanerd: couldn't become a daemon!"); 1398 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID, LOG_DAEMON); 1399 signal(SIGINT, sig_exit); 1400 } 1401 1402 /* 1403 * Look for an already-running master daemon. If there is one, 1404 * send it our filesystems to add to its list and exit. 1405 * If there is none, become the master. 1406 */ 1407 #ifdef USE_CLIENT_SERVER 1408 try_to_become_master(argc, argv); 1409 #else 1410 /* XXX think about this */ 1411 asprintf(&pidname, "lfs_cleanerd:m:%s", argv[0]); 1412 if (pidname == NULL) { 1413 syslog(LOG_ERR, "malloc failed: %m"); 1414 exit(1); 1415 } 1416 for (cp = pidname; cp != NULL; cp = strchr(cp, '/')) 1417 *cp = '|'; 1418 pidfile(pidname); 1419 #endif 1420 1421 /* 1422 * Signals mean daemon should report its statistics 1423 */ 1424 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1425 signal(SIGUSR1, sig_report); 1426 signal(SIGUSR2, sig_report); 1427 1428 /* 1429 * Start up buffer cache. We only use this for the Ifile, 1430 * and we will resize it if necessary, so it can start small. 1431 */ 1432 bufinit(4); 1433 1434 #ifdef REPAIR_ZERO_FINFO 1435 { 1436 BLOCK_INFO *bip = NULL; 1437 int bic = 0; 1438 1439 nfss = 1; 1440 fsp = (struct clfs **)malloc(sizeof(*fsp)); 1441 fsp[0] = (struct clfs *)calloc(1, sizeof(**fsp)); 1442 1443 if (init_unmounted_fs(fsp[0], argv[0]) < 0) { 1444 err(1, "init_unmounted_fs"); 1445 } 1446 dlog("Filesystem has %d segments", fsp[0]->lfs_nseg); 1447 for (i = 0; i < fsp[0]->lfs_nseg; i++) { 1448 load_segment(fsp[0], i, &bip, &bic); 1449 bic = 0; 1450 } 1451 exit(0); 1452 } 1453 #endif 1454 1455 /* 1456 * Initialize cleaning structures, open devices, etc. 1457 */ 1458 nfss = argc; 1459 fsp = (struct clfs **)malloc(nfss * sizeof(*fsp)); 1460 if (fsp == NULL) { 1461 syslog(LOG_ERR, "couldn't allocate fs table: %m"); 1462 exit(1); 1463 } 1464 for (i = 0; i < nfss; i++) { 1465 fsp[i] = (struct clfs *)calloc(1, sizeof(**fsp)); 1466 if ((r = init_fs(fsp[i], argv[i])) < 0) { 1467 syslog(LOG_ERR, "%s: couldn't init: error code %d", 1468 argv[i], r); 1469 handle_error(fsp, i); 1470 --i; /* Do the new #i over again */ 1471 } 1472 } 1473 1474 /* 1475 * If asked to coalesce, do so and exit. 1476 */ 1477 if (do_coalesce) { 1478 for (i = 0; i < nfss; i++) 1479 clean_all_inodes(fsp[i]); 1480 exit(0); 1481 } 1482 1483 /* 1484 * If asked to invalidate a segment, do that and exit. 1485 */ 1486 if (inval_segment >= 0) { 1487 invalidate_segment(fsp[0], inval_segment); 1488 exit(0); 1489 } 1490 1491 /* 1492 * Main cleaning loop. 1493 */ 1494 loopcount = 0; 1495 while (nfss > 0) { 1496 int cleaned_one; 1497 do { 1498 #ifdef USE_CLIENT_SERVER 1499 check_control_socket(); 1500 #endif 1501 cleaned_one = 0; 1502 for (i = 0; i < nfss; i++) { 1503 if ((error = needs_cleaning(fsp[i], &ci)) < 0) { 1504 handle_error(fsp, i); 1505 continue; 1506 } 1507 if (error == 0) /* No need to clean */ 1508 continue; 1509 1510 reload_ifile(fsp[i]); 1511 if (clean_fs(fsp[i], &ci) < 0) { 1512 handle_error(fsp, i); 1513 continue; 1514 } 1515 ++cleaned_one; 1516 } 1517 ++loopcount; 1518 if (stat_report && loopcount % stat_report == 0) 1519 sig_report(0); 1520 if (do_quit) 1521 exit(0); 1522 } while(cleaned_one); 1523 tv.tv_sec = segwait_timeout; 1524 tv.tv_usec = 0; 1525 fcntl(fsp[0]->clfs_ifilefd, LFCNSEGWAITALL, &tv); 1526 } 1527 1528 /* NOTREACHED */ 1529 return 0; 1530 } 1531