1 /* $NetBSD: lfs_cleanerd.c,v 1.55 2015/10/15 06:25:04 dholland Exp $ */ 2 3 /*- 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * The cleaner daemon for the NetBSD Log-structured File System. 34 * Only tested for use with version 2 LFSs. 35 */ 36 37 #include <sys/syslog.h> 38 #include <sys/param.h> 39 #include <sys/mount.h> 40 #include <sys/stat.h> 41 #include <ufs/lfs/lfs.h> 42 43 #include <assert.h> 44 #include <err.h> 45 #include <errno.h> 46 #include <fcntl.h> 47 #include <semaphore.h> 48 #include <stdio.h> 49 #include <stdlib.h> 50 #include <string.h> 51 #include <unistd.h> 52 #include <time.h> 53 #include <util.h> 54 55 #include "bufcache.h" 56 #include "vnode.h" 57 #include "lfs_user.h" 58 #include "fdfs.h" 59 #include "cleaner.h" 60 #include "kernelops.h" 61 #include "mount_lfs.h" 62 63 /* 64 * Global variables. 65 */ 66 /* XXX these top few should really be fs-specific */ 67 int use_fs_idle; /* Use fs idle rather than cpu idle time */ 68 int use_bytes; /* Use bytes written rather than segments cleaned */ 69 double load_threshold; /* How idle is idle (CPU idle) */ 70 int atatime; /* How many segments (bytes) to clean at a time */ 71 72 int nfss; /* Number of filesystems monitored by this cleanerd */ 73 struct clfs **fsp; /* Array of extended filesystem structures */ 74 int segwait_timeout; /* Time to wait in lfs_segwait() */ 75 int do_quit; /* Quit after one cleaning loop */ 76 int do_coalesce; /* Coalesce filesystem */ 77 int do_small; /* Use small writes through markv */ 78 char *copylog_filename; /* File to use for fs debugging analysis */ 79 int inval_segment; /* Segment to invalidate */ 80 int stat_report; /* Report statistics for this period of cycles */ 81 int debug; /* Turn on debugging */ 82 struct cleaner_stats { 83 double util_tot; 84 double util_sos; 85 off_t bytes_read; 86 off_t bytes_written; 87 off_t segs_cleaned; 88 off_t segs_empty; 89 off_t segs_error; 90 } cleaner_stats; 91 92 extern u_int32_t cksum(void *, size_t); 93 extern u_int32_t lfs_sb_cksum(struct dlfs *); 94 extern u_int32_t lfs_cksum_part(void *, size_t, u_int32_t); 95 extern int ulfs_getlbns(struct lfs *, struct uvnode *, daddr_t, struct indir *, int *); 96 97 /* Ugh */ 98 #define FSMNT_SIZE MAX(sizeof(((struct dlfs *)0)->dlfs_fsmnt), \ 99 sizeof(((struct dlfs64 *)0)->dlfs_fsmnt)) 100 101 102 /* Compat */ 103 void pwarn(const char *unused, ...) { /* Does nothing */ }; 104 105 /* 106 * Log a message if debugging is turned on. 107 */ 108 void 109 dlog(const char *fmt, ...) 110 { 111 va_list ap; 112 113 if (debug == 0) 114 return; 115 116 va_start(ap, fmt); 117 vsyslog(LOG_DEBUG, fmt, ap); 118 va_end(ap); 119 } 120 121 /* 122 * Remove the specified filesystem from the list, due to its having 123 * become unmounted or other error condition. 124 */ 125 void 126 handle_error(struct clfs **cfsp, int n) 127 { 128 syslog(LOG_NOTICE, "%s: detaching cleaner", lfs_sb_getfsmnt(cfsp[n])); 129 free(cfsp[n]); 130 if (n != nfss - 1) 131 cfsp[n] = cfsp[nfss - 1]; 132 --nfss; 133 } 134 135 /* 136 * Reinitialize a filesystem if, e.g., its size changed. 137 */ 138 int 139 reinit_fs(struct clfs *fs) 140 { 141 char fsname[FSMNT_SIZE]; 142 143 memcpy(fsname, lfs_sb_getfsmnt(fs), sizeof(fsname)); 144 fsname[sizeof(fsname) - 1] = '\0'; 145 146 kops.ko_close(fs->clfs_ifilefd); 147 kops.ko_close(fs->clfs_devfd); 148 fd_reclaim(fs->clfs_devvp); 149 fd_reclaim(fs->lfs_ivnode); 150 free(fs->clfs_dev); 151 free(fs->clfs_segtab); 152 free(fs->clfs_segtabp); 153 154 return init_fs(fs, fsname); 155 } 156 157 #ifdef REPAIR_ZERO_FINFO 158 /* 159 * Use fsck's lfs routines to load the Ifile from an unmounted fs. 160 * We interpret "fsname" as the name of the raw disk device. 161 */ 162 int 163 init_unmounted_fs(struct clfs *fs, char *fsname) 164 { 165 struct lfs *disc_fs; 166 int i; 167 168 fs->clfs_dev = fsname; 169 if ((fs->clfs_devfd = kops.ko_open(fs->clfs_dev, O_RDWR)) < 0) { 170 syslog(LOG_ERR, "couldn't open device %s read/write", 171 fs->clfs_dev); 172 return -1; 173 } 174 175 disc_fs = lfs_init(fs->clfs_devfd, 0, 0, 0, 0); 176 177 fs->lfs_dlfs = disc_fs->lfs_dlfs; /* Structure copy */ 178 strncpy(fs->lfs_fsmnt, fsname, MNAMELEN); 179 fs->lfs_ivnode = (struct uvnode *)disc_fs->lfs_ivnode; 180 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize, 181 atatime); 182 183 /* Allocate and clear segtab */ 184 fs->clfs_segtab = (struct clfs_seguse *)malloc(lfs_sb_getnseg(fs) * 185 sizeof(*fs->clfs_segtab)); 186 fs->clfs_segtabp = (struct clfs_seguse **)malloc(lfs_sb_getnseg(fs) * 187 sizeof(*fs->clfs_segtabp)); 188 for (i = 0; i < lfs_sb_getnseg(fs); i++) { 189 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 190 fs->clfs_segtab[i].flags = 0x0; 191 } 192 syslog(LOG_NOTICE, "%s: unmounted cleaner starting", fsname); 193 194 return 0; 195 } 196 #endif 197 198 /* 199 * Set up the file descriptors, including the Ifile descriptor. 200 * If we can't get the Ifile, this is not an LFS (or the kernel is 201 * too old to support the fcntl). 202 * XXX Merge this and init_unmounted_fs, switching on whether 203 * XXX "fsname" is a dir or a char special device. Should 204 * XXX also be able to read unmounted devices out of fstab, the way 205 * XXX fsck does. 206 */ 207 int 208 init_fs(struct clfs *fs, char *fsname) 209 { 210 char mnttmp[FSMNT_SIZE]; 211 struct statvfs sf; 212 int rootfd; 213 int i; 214 void *sbuf; 215 char *bn; 216 217 /* 218 * Get the raw device from the block device. 219 * XXX this is ugly. Is there a way to discover the raw device 220 * XXX for a given mount point? 221 */ 222 if (kops.ko_statvfs(fsname, &sf, ST_WAIT) < 0) 223 return -1; 224 fs->clfs_dev = malloc(strlen(sf.f_mntfromname) + 2); 225 if (fs->clfs_dev == NULL) { 226 syslog(LOG_ERR, "couldn't malloc device name string: %m"); 227 return -1; 228 } 229 bn = strrchr(sf.f_mntfromname, '/'); 230 bn = bn ? bn+1 : sf.f_mntfromname; 231 strlcpy(fs->clfs_dev, sf.f_mntfromname, bn - sf.f_mntfromname + 1); 232 strcat(fs->clfs_dev, "r"); 233 strcat(fs->clfs_dev, bn); 234 if ((fs->clfs_devfd = kops.ko_open(fs->clfs_dev, O_RDONLY, 0)) < 0) { 235 syslog(LOG_ERR, "couldn't open device %s for reading", 236 fs->clfs_dev); 237 return -1; 238 } 239 240 /* Find the Ifile and open it */ 241 if ((rootfd = kops.ko_open(fsname, O_RDONLY, 0)) < 0) 242 return -2; 243 if (kops.ko_fcntl(rootfd, LFCNIFILEFH, &fs->clfs_ifilefh) < 0) 244 return -3; 245 if ((fs->clfs_ifilefd = kops.ko_fhopen(&fs->clfs_ifilefh, 246 sizeof(fs->clfs_ifilefh), O_RDONLY)) < 0) 247 return -4; 248 kops.ko_close(rootfd); 249 250 sbuf = malloc(LFS_SBPAD); 251 if (sbuf == NULL) { 252 syslog(LOG_ERR, "couldn't malloc superblock buffer"); 253 return -1; 254 } 255 256 /* Load in the superblock */ 257 if (kops.ko_pread(fs->clfs_devfd, sbuf, LFS_SBPAD, LFS_LABELPAD) < 0) { 258 free(sbuf); 259 return -1; 260 } 261 262 __CTASSERT(sizeof(struct dlfs) == sizeof(struct dlfs64)); 263 memcpy(&fs->lfs_dlfs_u, sbuf, sizeof(struct dlfs)); 264 free(sbuf); 265 266 /* If it is not LFS, complain and exit! */ 267 switch (fs->lfs_dlfs_u.u_32.dlfs_magic) { 268 case LFS_MAGIC: 269 fs->lfs_is64 = false; 270 fs->lfs_dobyteswap = false; 271 break; 272 case LFS_MAGIC_SWAPPED: 273 fs->lfs_is64 = false; 274 fs->lfs_dobyteswap = true; 275 break; 276 case LFS64_MAGIC: 277 fs->lfs_is64 = true; 278 fs->lfs_dobyteswap = false; 279 break; 280 case LFS64_MAGIC_SWAPPED: 281 fs->lfs_is64 = true; 282 fs->lfs_dobyteswap = true; 283 break; 284 default: 285 syslog(LOG_ERR, "%s: not LFS", fsname); 286 return -1; 287 } 288 /* XXX: can this ever need to be set? does the cleaner even care? */ 289 fs->lfs_hasolddirfmt = 0; 290 291 /* If this is not a version 2 filesystem, complain and exit */ 292 if (lfs_sb_getversion(fs) != 2) { 293 syslog(LOG_ERR, "%s: not a version 2 LFS", fsname); 294 return -1; 295 } 296 297 /* Assume fsname is the mounted name */ 298 strncpy(mnttmp, fsname, sizeof(mnttmp)); 299 mnttmp[sizeof(mnttmp) - 1] = '\0'; 300 lfs_sb_setfsmnt(fs, mnttmp); 301 302 /* Set up vnodes for Ifile and raw device */ 303 fs->lfs_ivnode = fd_vget(fs->clfs_ifilefd, lfs_sb_getbsize(fs), 0, 0); 304 fs->clfs_devvp = fd_vget(fs->clfs_devfd, lfs_sb_getfsize(fs), lfs_sb_getssize(fs), 305 atatime); 306 307 /* Allocate and clear segtab */ 308 fs->clfs_segtab = (struct clfs_seguse *)malloc(lfs_sb_getnseg(fs) * 309 sizeof(*fs->clfs_segtab)); 310 fs->clfs_segtabp = (struct clfs_seguse **)malloc(lfs_sb_getnseg(fs) * 311 sizeof(*fs->clfs_segtabp)); 312 if (fs->clfs_segtab == NULL || fs->clfs_segtabp == NULL) { 313 syslog(LOG_ERR, "%s: couldn't malloc segment table: %m", 314 fs->clfs_dev); 315 return -1; 316 } 317 318 for (i = 0; i < lfs_sb_getnseg(fs); i++) { 319 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]); 320 fs->clfs_segtab[i].flags = 0x0; 321 } 322 323 syslog(LOG_NOTICE, "%s: attaching cleaner", fsname); 324 return 0; 325 } 326 327 /* 328 * Invalidate all the currently held Ifile blocks so they will be 329 * reread when we clean. Check the size while we're at it, and 330 * resize the buffer cache if necessary. 331 */ 332 void 333 reload_ifile(struct clfs *fs) 334 { 335 struct ubuf *bp; 336 struct stat st; 337 int ohashmax; 338 extern int hashmax; 339 340 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_dirtyblkhd)) != NULL) { 341 bremfree(bp); 342 buf_destroy(bp); 343 } 344 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_cleanblkhd)) != NULL) { 345 bremfree(bp); 346 buf_destroy(bp); 347 } 348 349 /* If Ifile is larger than buffer cache, rehash */ 350 fstat(fs->clfs_ifilefd, &st); 351 if (st.st_size / lfs_sb_getbsize(fs) > hashmax) { 352 ohashmax = hashmax; 353 bufrehash(st.st_size / lfs_sb_getbsize(fs)); 354 dlog("%s: resized buffer hash from %d to %d", 355 lfs_sb_getfsmnt(fs), ohashmax, hashmax); 356 } 357 } 358 359 /* 360 * Get IFILE entry for the given inode, store in ifpp. The buffer 361 * which contains that data is returned in bpp, and must be brelse()d 362 * by the caller. 363 * 364 * XXX this is cutpaste of LFS_IENTRY from lfs.h; unify the two. 365 */ 366 void 367 lfs_ientry(IFILE **ifpp, struct clfs *fs, ino_t ino, struct ubuf **bpp) 368 { 369 IFILE64 *ifp64; 370 IFILE32 *ifp32; 371 IFILE_V1 *ifp_v1; 372 int error; 373 374 error = bread(fs->lfs_ivnode, 375 ino / lfs_sb_getifpb(fs) + lfs_sb_getcleansz(fs) + 376 lfs_sb_getsegtabsz(fs), lfs_sb_getbsize(fs), 0, bpp); 377 if (error) 378 syslog(LOG_ERR, "%s: ientry failed for ino %d", 379 lfs_sb_getfsmnt(fs), (int)ino); 380 if (fs->lfs_is64) { 381 ifp64 = (IFILE64 *)(*bpp)->b_data; 382 ifp64 += ino % lfs_sb_getifpb(fs); 383 *ifpp = (IFILE *)ifp64; 384 } else if (lfs_sb_getversion(fs) > 1) { 385 ifp32 = (IFILE32 *)(*bpp)->b_data; 386 ifp32 += ino % lfs_sb_getifpb(fs); 387 *ifpp = (IFILE *)ifp32; 388 } else { 389 ifp_v1 = (IFILE_V1 *)(*bpp)->b_data; 390 ifp_v1 += ino % lfs_sb_getifpb(fs); 391 *ifpp = (IFILE *)ifp_v1; 392 } 393 return; 394 } 395 396 #ifdef TEST_PATTERN 397 /* 398 * Check ULFS_ROOTINO for file data. The assumption is that we are running 399 * the "twofiles" test with the rest of the filesystem empty. Files 400 * created by "twofiles" match the test pattern, but ULFS_ROOTINO and the 401 * executable itself (assumed to be inode 3) should not match. 402 */ 403 static void 404 check_test_pattern(BLOCK_INFO *bip) 405 { 406 int j; 407 unsigned char *cp = bip->bi_bp; 408 409 /* Check inode sanity */ 410 if (bip->bi_lbn == LFS_UNUSED_LBN) { 411 assert(((struct ulfs1_dinode *)bip->bi_bp)->di_inumber == 412 bip->bi_inode); 413 } 414 415 /* These can have the test pattern and it's all good */ 416 if (bip->bi_inode > 3) 417 return; 418 419 for (j = 0; j < bip->bi_size; j++) { 420 if (cp[j] != (j & 0xff)) 421 break; 422 } 423 assert(j < bip->bi_size); 424 } 425 #endif /* TEST_PATTERN */ 426 427 /* 428 * Parse the partial segment at daddr, adding its information to 429 * bip. Return the address of the next partial segment to read. 430 */ 431 static daddr_t 432 parse_pseg(struct clfs *fs, daddr_t daddr, BLOCK_INFO **bipp, int *bic) 433 { 434 SEGSUM *ssp; 435 IFILE *ifp; 436 BLOCK_INFO *bip, *nbip; 437 daddr_t idaddr, odaddr; 438 FINFO *fip; 439 IINFO *iip; 440 struct ubuf *ifbp; 441 union lfs_dinode *dip; 442 u_int32_t ck, vers; 443 int fic, inoc, obic; 444 size_t sumstart; 445 int i; 446 char *cp; 447 448 odaddr = daddr; 449 obic = *bic; 450 bip = *bipp; 451 452 /* 453 * Retrieve the segment header, set up the SEGSUM pointer 454 * as well as the first FINFO and inode address pointer. 455 */ 456 cp = fd_ptrget(fs->clfs_devvp, daddr); 457 ssp = (SEGSUM *)cp; 458 iip = SEGSUM_IINFOSTART(fs, cp); 459 fip = SEGSUM_FINFOBASE(fs, cp); 460 461 /* 462 * Check segment header magic and checksum 463 */ 464 if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) { 465 syslog(LOG_WARNING, "%s: sumsum magic number bad at 0x%jx:" 466 " read 0x%x, expected 0x%x", lfs_sb_getfsmnt(fs), 467 (intmax_t)daddr, lfs_ss_getmagic(fs, ssp), SS_MAGIC); 468 return 0x0; 469 } 470 sumstart = lfs_ss_getsumstart(fs); 471 ck = cksum((char *)ssp + sumstart, lfs_sb_getsumsize(fs) - sumstart); 472 if (ck != lfs_ss_getsumsum(fs, ssp)) { 473 syslog(LOG_WARNING, "%s: sumsum checksum mismatch at 0x%jx:" 474 " read 0x%x, computed 0x%x", lfs_sb_getfsmnt(fs), 475 (intmax_t)daddr, lfs_ss_getsumsum(fs, ssp), ck); 476 return 0x0; 477 } 478 479 /* Initialize data sum */ 480 ck = 0; 481 482 /* Point daddr at next block after segment summary */ 483 ++daddr; 484 485 /* 486 * Loop over file info and inode pointers. We always move daddr 487 * forward here because we are also computing the data checksum 488 * as we go. 489 */ 490 fic = inoc = 0; 491 while (fic < lfs_ss_getnfinfo(fs, ssp) || inoc < lfs_ss_getninos(fs, ssp)) { 492 /* 493 * We must have either a file block or an inode block. 494 * If we don't have either one, it's an error. 495 */ 496 if (fic >= lfs_ss_getnfinfo(fs, ssp) && lfs_ii_getblock(fs, iip) != daddr) { 497 syslog(LOG_WARNING, "%s: bad pseg at %jx (seg %d)", 498 lfs_sb_getfsmnt(fs), (intmax_t)odaddr, lfs_dtosn(fs, odaddr)); 499 *bipp = bip; 500 return 0x0; 501 } 502 503 /* 504 * Note each inode from the inode blocks 505 */ 506 if (inoc < lfs_ss_getninos(fs, ssp) && lfs_ii_getblock(fs, iip) == daddr) { 507 cp = fd_ptrget(fs->clfs_devvp, daddr); 508 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 509 for (i = 0; i < lfs_sb_getinopb(fs); i++) { 510 dip = DINO_IN_BLOCK(fs, cp, i); 511 if (lfs_dino_getinumber(fs, dip) == 0) 512 break; 513 514 /* 515 * Check currency before adding it 516 */ 517 #ifndef REPAIR_ZERO_FINFO 518 lfs_ientry(&ifp, fs, lfs_dino_getinumber(fs, dip), &ifbp); 519 idaddr = lfs_if_getdaddr(fs, ifp); 520 brelse(ifbp, 0); 521 if (idaddr != daddr) 522 #endif 523 continue; 524 525 /* 526 * A current inode. Add it. 527 */ 528 ++*bic; 529 nbip = (BLOCK_INFO *)realloc(bip, *bic * 530 sizeof(*bip)); 531 if (nbip) 532 bip = nbip; 533 else { 534 --*bic; 535 *bipp = bip; 536 return 0x0; 537 } 538 bip[*bic - 1].bi_inode = lfs_dino_getinumber(fs, dip); 539 bip[*bic - 1].bi_lbn = LFS_UNUSED_LBN; 540 bip[*bic - 1].bi_daddr = daddr; 541 bip[*bic - 1].bi_segcreate = lfs_ss_getcreate(fs, ssp); 542 bip[*bic - 1].bi_version = lfs_dino_getgen(fs, dip); 543 bip[*bic - 1].bi_bp = dip; 544 bip[*bic - 1].bi_size = DINOSIZE(fs); 545 } 546 inoc += i; 547 daddr += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 548 iip = NEXTLOWER_IINFO(fs, iip); 549 continue; 550 } 551 552 /* 553 * Note each file block from the finfo blocks 554 */ 555 if (fic >= lfs_ss_getnfinfo(fs, ssp)) 556 continue; 557 558 /* Count this finfo, whether or not we use it */ 559 ++fic; 560 561 /* 562 * If this finfo has nblocks==0, it was written wrong. 563 * Kernels with this problem always wrote this zero-sized 564 * finfo last, so just ignore it. 565 */ 566 if (lfs_fi_getnblocks(fs, fip) == 0) { 567 #ifdef REPAIR_ZERO_FINFO 568 struct ubuf *nbp; 569 SEGSUM *nssp; 570 571 syslog(LOG_WARNING, "fixing short FINFO at %jx (seg %d)", 572 (intmax_t)odaddr, lfs_dtosn(fs, odaddr)); 573 bread(fs->clfs_devvp, odaddr, lfs_sb_getfsize(fs), 574 0, &nbp); 575 nssp = (SEGSUM *)nbp->b_data; 576 --nssp->ss_nfinfo; 577 nssp->ss_sumsum = cksum(&nssp->ss_datasum, 578 lfs_sb_getsumsize(fs) - sizeof(nssp->ss_sumsum)); 579 bwrite(nbp); 580 #endif 581 syslog(LOG_WARNING, "zero-length FINFO at %jx (seg %d)", 582 (intmax_t)odaddr, lfs_dtosn(fs, odaddr)); 583 continue; 584 } 585 586 /* 587 * Check currency before adding blocks 588 */ 589 #ifdef REPAIR_ZERO_FINFO 590 vers = -1; 591 #else 592 lfs_ientry(&ifp, fs, lfs_fi_getino(fs, fip), &ifbp); 593 vers = lfs_if_getversion(fs, ifp); 594 brelse(ifbp, 0); 595 #endif 596 if (vers != lfs_fi_getversion(fs, fip)) { 597 size_t size; 598 599 /* Read all the blocks from the data summary */ 600 for (i = 0; i < lfs_fi_getnblocks(fs, fip); i++) { 601 size = (i == lfs_fi_getnblocks(fs, fip) - 1) ? 602 lfs_fi_getlastlength(fs, fip) : lfs_sb_getbsize(fs); 603 cp = fd_ptrget(fs->clfs_devvp, daddr); 604 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 605 daddr += lfs_btofsb(fs, size); 606 } 607 fip = NEXT_FINFO(fs, fip); 608 continue; 609 } 610 611 /* Add all the blocks from the finfos (current or not) */ 612 nbip = (BLOCK_INFO *)realloc(bip, (*bic + lfs_fi_getnblocks(fs, fip)) * 613 sizeof(*bip)); 614 if (nbip) 615 bip = nbip; 616 else { 617 *bipp = bip; 618 return 0x0; 619 } 620 621 for (i = 0; i < lfs_fi_getnblocks(fs, fip); i++) { 622 bip[*bic + i].bi_inode = lfs_fi_getino(fs, fip); 623 bip[*bic + i].bi_lbn = lfs_fi_getblock(fs, fip, i); 624 bip[*bic + i].bi_daddr = daddr; 625 bip[*bic + i].bi_segcreate = lfs_ss_getcreate(fs, ssp); 626 bip[*bic + i].bi_version = lfs_fi_getversion(fs, fip); 627 bip[*bic + i].bi_size = (i == lfs_fi_getnblocks(fs, fip) - 1) ? 628 lfs_fi_getlastlength(fs, fip) : lfs_sb_getbsize(fs); 629 cp = fd_ptrget(fs->clfs_devvp, daddr); 630 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck); 631 bip[*bic + i].bi_bp = cp; 632 daddr += lfs_btofsb(fs, bip[*bic + i].bi_size); 633 634 #ifdef TEST_PATTERN 635 check_test_pattern(bip + *bic + i); /* XXXDEBUG */ 636 #endif 637 } 638 *bic += lfs_fi_getnblocks(fs, fip); 639 fip = NEXT_FINFO(fs, fip); 640 } 641 642 #ifndef REPAIR_ZERO_FINFO 643 if (lfs_ss_getdatasum(fs, ssp) != ck) { 644 syslog(LOG_WARNING, "%s: data checksum bad at 0x%jx:" 645 " read 0x%x, computed 0x%x", lfs_sb_getfsmnt(fs), 646 (intmax_t)odaddr, 647 lfs_ss_getdatasum(fs, ssp), ck); 648 *bic = obic; 649 return 0x0; 650 } 651 #endif 652 653 *bipp = bip; 654 return daddr; 655 } 656 657 static void 658 log_segment_read(struct clfs *fs, int sn) 659 { 660 FILE *fp; 661 char *cp; 662 663 /* 664 * Write the segment read, and its contents, into a log file in 665 * the current directory. We don't need to log the location of 666 * the segment, since that can be inferred from the segments up 667 * to this point (ss_nextseg field of the previously written segment). 668 * 669 * We can use this info later to reconstruct the filesystem at any 670 * given point in time for analysis, by replaying the log forward 671 * indexed by the segment serial numbers; but it is not suitable 672 * for everyday use since the copylog will be simply enormous. 673 */ 674 cp = fd_ptrget(fs->clfs_devvp, lfs_sntod(fs, sn)); 675 676 fp = fopen(copylog_filename, "ab"); 677 if (fp != NULL) { 678 if (fwrite(cp, (size_t)lfs_sb_getssize(fs), 1, fp) != 1) { 679 perror("writing segment to copy log"); 680 } 681 } 682 fclose(fp); 683 } 684 685 /* 686 * Read a segment to populate the BLOCK_INFO structures. 687 * Return the number of partial segments read and parsed. 688 */ 689 int 690 load_segment(struct clfs *fs, int sn, BLOCK_INFO **bipp, int *bic) 691 { 692 daddr_t daddr; 693 int i, npseg; 694 695 daddr = lfs_sntod(fs, sn); 696 if (daddr < lfs_btofsb(fs, LFS_LABELPAD)) 697 daddr = lfs_btofsb(fs, LFS_LABELPAD); 698 for (i = 0; i < LFS_MAXNUMSB; i++) { 699 if (lfs_sb_getsboff(fs, i) == daddr) { 700 daddr += lfs_btofsb(fs, LFS_SBPAD); 701 break; 702 } 703 } 704 705 /* Preload the segment buffer */ 706 if (fd_preload(fs->clfs_devvp, lfs_sntod(fs, sn)) < 0) 707 return -1; 708 709 if (copylog_filename) 710 log_segment_read(fs, sn); 711 712 /* Note bytes read for stats */ 713 cleaner_stats.segs_cleaned++; 714 cleaner_stats.bytes_read += lfs_sb_getssize(fs); 715 ++fs->clfs_nactive; 716 717 npseg = 0; 718 while(lfs_dtosn(fs, daddr) == sn && 719 lfs_dtosn(fs, daddr + lfs_btofsb(fs, lfs_sb_getbsize(fs))) == sn) { 720 daddr = parse_pseg(fs, daddr, bipp, bic); 721 if (daddr == 0x0) { 722 ++cleaner_stats.segs_error; 723 break; 724 } 725 ++npseg; 726 } 727 728 return npseg; 729 } 730 731 void 732 calc_cb(struct clfs *fs, int sn, struct clfs_seguse *t) 733 { 734 time_t now; 735 int64_t age, benefit, cost; 736 737 time(&now); 738 age = (now < t->lastmod ? 0 : now - t->lastmod); 739 740 /* Under no circumstances clean active or already-clean segments */ 741 if ((t->flags & SEGUSE_ACTIVE) || !(t->flags & SEGUSE_DIRTY)) { 742 t->priority = 0; 743 return; 744 } 745 746 /* 747 * If the segment is empty, there is no reason to clean it. 748 * Clear its error condition, if any, since we are never going to 749 * try to parse this one. 750 */ 751 if (t->nbytes == 0) { 752 t->flags &= ~SEGUSE_ERROR; /* Strip error once empty */ 753 t->priority = 0; 754 return; 755 } 756 757 if (t->flags & SEGUSE_ERROR) { /* No good if not already empty */ 758 /* No benefit */ 759 t->priority = 0; 760 return; 761 } 762 763 if (t->nbytes > lfs_sb_getssize(fs)) { 764 /* Another type of error */ 765 syslog(LOG_WARNING, "segment %d: bad seguse count %d", 766 sn, t->nbytes); 767 t->flags |= SEGUSE_ERROR; 768 t->priority = 0; 769 return; 770 } 771 772 /* 773 * The non-degenerate case. Use Rosenblum's cost-benefit algorithm. 774 * Calculate the benefit from cleaning this segment (one segment, 775 * minus fragmentation, dirty blocks and a segment summary block) 776 * and weigh that against the cost (bytes read plus bytes written). 777 * We count the summary headers as "dirty" to avoid cleaning very 778 * old and very full segments. 779 */ 780 benefit = (int64_t)lfs_sb_getssize(fs) - t->nbytes - 781 (t->nsums + 1) * lfs_sb_getfsize(fs); 782 if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs)) /* fragmentation */ 783 benefit -= (lfs_sb_getbsize(fs) / 2); 784 if (benefit <= 0) { 785 t->priority = 0; 786 return; 787 } 788 789 cost = lfs_sb_getssize(fs) + t->nbytes; 790 t->priority = (256 * benefit * age) / cost; 791 792 return; 793 } 794 795 /* 796 * Comparator for BLOCK_INFO structures. Anything not in one of the segments 797 * we're looking at sorts higher; after that we sort first by inode number 798 * and then by block number (unsigned, i.e., negative sorts higher) *but* 799 * sort inodes before data blocks. 800 */ 801 static int 802 bi_comparator(const void *va, const void *vb) 803 { 804 const BLOCK_INFO *a, *b; 805 806 a = (const BLOCK_INFO *)va; 807 b = (const BLOCK_INFO *)vb; 808 809 /* Check for out-of-place block */ 810 if (a->bi_segcreate == a->bi_daddr && 811 b->bi_segcreate != b->bi_daddr) 812 return -1; 813 if (a->bi_segcreate != a->bi_daddr && 814 b->bi_segcreate == b->bi_daddr) 815 return 1; 816 if (a->bi_size <= 0 && b->bi_size > 0) 817 return 1; 818 if (b->bi_size <= 0 && a->bi_size > 0) 819 return -1; 820 821 /* Check inode number */ 822 if (a->bi_inode != b->bi_inode) 823 return a->bi_inode - b->bi_inode; 824 825 /* Check lbn */ 826 if (a->bi_lbn == LFS_UNUSED_LBN) /* Inodes sort lower than blocks */ 827 return -1; 828 if (b->bi_lbn == LFS_UNUSED_LBN) 829 return 1; 830 if ((u_int64_t)a->bi_lbn > (u_int64_t)b->bi_lbn) 831 return 1; 832 else 833 return -1; 834 835 return 0; 836 } 837 838 /* 839 * Comparator for sort_segments: cost-benefit equation. 840 */ 841 static int 842 cb_comparator(const void *va, const void *vb) 843 { 844 const struct clfs_seguse *a, *b; 845 846 a = *(const struct clfs_seguse * const *)va; 847 b = *(const struct clfs_seguse * const *)vb; 848 return a->priority > b->priority ? -1 : 1; 849 } 850 851 void 852 toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, blkcnt_t *bic, int *sizep) 853 { 854 blkcnt_t i; 855 int r; 856 BLOCK_INFO *bip = *bipp; 857 struct lfs_fcntl_markv /* { 858 BLOCK_INFO *blkiov; 859 int blkcnt; 860 } */ lim; 861 862 if (bic == 0 || bip == NULL) 863 return; 864 865 /* 866 * Kludge: Store the disk address in segcreate so we know which 867 * ones to toss. 868 */ 869 for (i = 0; i < *bic; i++) 870 bip[i].bi_segcreate = bip[i].bi_daddr; 871 872 /* 873 * XXX: blkcnt_t is 64 bits, so *bic might overflow size_t 874 * (the argument type of heapsort's number argument) on a 875 * 32-bit platform. However, if so we won't have got this far 876 * because we'll have failed trying to allocate the array. So 877 * while *bic here might cause a 64->32 truncation, it's safe. 878 */ 879 /* Sort the blocks */ 880 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 881 882 /* Use bmapv to locate the blocks */ 883 lim.blkiov = bip; 884 lim.blkcnt = *bic; 885 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim)) < 0) { 886 syslog(LOG_WARNING, "%s: bmapv returned %d (%m)", 887 lfs_sb_getfsmnt(fs), r); 888 return; 889 } 890 891 /* Toss blocks not in this segment */ 892 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator); 893 894 /* Get rid of stale blocks */ 895 if (sizep) 896 *sizep = 0; 897 for (i = 0; i < *bic; i++) { 898 if (bip[i].bi_segcreate != bip[i].bi_daddr) 899 break; 900 if (sizep) 901 *sizep += bip[i].bi_size; 902 } 903 *bic = i; /* XXX should we shrink bip? */ 904 *bipp = bip; 905 906 return; 907 } 908 909 /* 910 * Clean a segment and mark it invalid. 911 */ 912 int 913 invalidate_segment(struct clfs *fs, int sn) 914 { 915 BLOCK_INFO *bip; 916 int i, r, bic; 917 blkcnt_t widebic; 918 off_t nb; 919 double util; 920 struct lfs_fcntl_markv /* { 921 BLOCK_INFO *blkiov; 922 int blkcnt; 923 } */ lim; 924 925 dlog("%s: inval seg %d", lfs_sb_getfsmnt(fs), sn); 926 927 bip = NULL; 928 bic = 0; 929 fs->clfs_nactive = 0; 930 if (load_segment(fs, sn, &bip, &bic) <= 0) 931 return -1; 932 widebic = bic; 933 toss_old_blocks(fs, &bip, &widebic, NULL); 934 bic = widebic; 935 936 /* Record statistics */ 937 for (i = nb = 0; i < bic; i++) 938 nb += bip[i].bi_size; 939 util = ((double)nb) / (fs->clfs_nactive * lfs_sb_getssize(fs)); 940 cleaner_stats.util_tot += util; 941 cleaner_stats.util_sos += util * util; 942 cleaner_stats.bytes_written += nb; 943 944 /* 945 * Use markv to move the blocks. 946 */ 947 lim.blkiov = bip; 948 lim.blkcnt = bic; 949 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) { 950 syslog(LOG_WARNING, "%s: markv returned %d (%m) " 951 "for seg %d", lfs_sb_getfsmnt(fs), r, sn); 952 return r; 953 } 954 955 /* 956 * Finally call invalidate to invalidate the segment. 957 */ 958 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNINVAL, &sn)) < 0) { 959 syslog(LOG_WARNING, "%s: inval returned %d (%m) " 960 "for seg %d", lfs_sb_getfsmnt(fs), r, sn); 961 return r; 962 } 963 964 return 0; 965 } 966 967 /* 968 * Check to see if the given ino/lbn pair is represented in the BLOCK_INFO 969 * array we are sending to the kernel, or if the kernel will have to add it. 970 * The kernel will only add each such pair once, though, so keep track of 971 * previous requests in a separate "extra" BLOCK_INFO array. Returns 1 972 * if the block needs to be added, 0 if it is already represented. 973 */ 974 static int 975 check_or_add(ino_t ino, daddr_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp) 976 { 977 BLOCK_INFO *t, *ebip = *ebipp; 978 int ebic = *ebicp; 979 int k; 980 981 for (k = 0; k < bic; k++) { 982 if (bip[k].bi_inode != ino) 983 break; 984 if (bip[k].bi_lbn == lbn) { 985 return 0; 986 } 987 } 988 989 /* Look on the list of extra blocks, too */ 990 for (k = 0; k < ebic; k++) { 991 if (ebip[k].bi_inode == ino && ebip[k].bi_lbn == lbn) { 992 return 0; 993 } 994 } 995 996 ++ebic; 997 t = realloc(ebip, ebic * sizeof(BLOCK_INFO)); 998 if (t == NULL) 999 return 1; /* Note *ebicp is unchanged */ 1000 1001 ebip = t; 1002 ebip[ebic - 1].bi_inode = ino; 1003 ebip[ebic - 1].bi_lbn = lbn; 1004 1005 *ebipp = ebip; 1006 *ebicp = ebic; 1007 return 1; 1008 } 1009 1010 /* 1011 * Look for indirect blocks we will have to write which are not 1012 * contained in this collection of blocks. This constitutes 1013 * a hidden cleaning cost, since we are unaware of it until we 1014 * have already read the segments. Return the total cost, and fill 1015 * in *ifc with the part of that cost due to rewriting the Ifile. 1016 */ 1017 static off_t 1018 check_hidden_cost(struct clfs *fs, BLOCK_INFO *bip, int bic, off_t *ifc) 1019 { 1020 int start; 1021 struct indir in[ULFS_NIADDR + 1]; 1022 int num; 1023 int i, j, ebic; 1024 BLOCK_INFO *ebip; 1025 daddr_t lbn; 1026 1027 start = 0; 1028 ebip = NULL; 1029 ebic = 0; 1030 for (i = 0; i < bic; i++) { 1031 if (i == 0 || bip[i].bi_inode != bip[start].bi_inode) { 1032 start = i; 1033 /* 1034 * Look for IFILE blocks, unless this is the Ifile. 1035 */ 1036 if (bip[i].bi_inode != LFS_IFILE_INUM) { 1037 lbn = lfs_sb_getcleansz(fs) + bip[i].bi_inode / 1038 lfs_sb_getifpb(fs); 1039 *ifc += check_or_add(LFS_IFILE_INUM, lbn, 1040 bip, bic, &ebip, &ebic); 1041 } 1042 } 1043 if (bip[i].bi_lbn == LFS_UNUSED_LBN) 1044 continue; 1045 if (bip[i].bi_lbn < ULFS_NDADDR) 1046 continue; 1047 1048 /* XXX the struct lfs cast is completely wrong/unsafe */ 1049 ulfs_getlbns((struct lfs *)fs, NULL, (daddr_t)bip[i].bi_lbn, in, &num); 1050 for (j = 0; j < num; j++) { 1051 check_or_add(bip[i].bi_inode, in[j].in_lbn, 1052 bip + start, bic - start, &ebip, &ebic); 1053 } 1054 } 1055 return ebic; 1056 } 1057 1058 /* 1059 * Select segments to clean, add blocks from these segments to a cleaning 1060 * list, and send this list through lfs_markv() to move them to new 1061 * locations on disk. 1062 */ 1063 static int 1064 clean_fs(struct clfs *fs, const CLEANERINFO64 *cip) 1065 { 1066 int i, j, ngood, sn, bic, r, npos; 1067 blkcnt_t widebic; 1068 int bytes, totbytes; 1069 struct ubuf *bp; 1070 SEGUSE *sup; 1071 static BLOCK_INFO *bip; 1072 struct lfs_fcntl_markv /* { 1073 BLOCK_INFO *blkiov; 1074 int blkcnt; 1075 } */ lim; 1076 int mc; 1077 BLOCK_INFO *mbip; 1078 int inc; 1079 off_t nb; 1080 off_t goal; 1081 off_t extra, if_extra; 1082 double util; 1083 1084 /* Read the segment table into our private structure */ 1085 npos = 0; 1086 for (i = 0; i < lfs_sb_getnseg(fs); i+= lfs_sb_getsepb(fs)) { 1087 bread(fs->lfs_ivnode, 1088 lfs_sb_getcleansz(fs) + i / lfs_sb_getsepb(fs), 1089 lfs_sb_getbsize(fs), 0, &bp); 1090 for (j = 0; j < lfs_sb_getsepb(fs) && i + j < lfs_sb_getnseg(fs); j++) { 1091 sup = ((SEGUSE *)bp->b_data) + j; 1092 fs->clfs_segtab[i + j].nbytes = sup->su_nbytes; 1093 fs->clfs_segtab[i + j].nsums = sup->su_nsums; 1094 fs->clfs_segtab[i + j].lastmod = sup->su_lastmod; 1095 /* Keep error status but renew other flags */ 1096 fs->clfs_segtab[i + j].flags &= SEGUSE_ERROR; 1097 fs->clfs_segtab[i + j].flags |= sup->su_flags; 1098 1099 /* Compute cost-benefit coefficient */ 1100 calc_cb(fs, i + j, fs->clfs_segtab + i + j); 1101 if (fs->clfs_segtab[i + j].priority > 0) 1102 ++npos; 1103 } 1104 brelse(bp, 0); 1105 } 1106 1107 /* Sort segments based on cleanliness, fulness, and condition */ 1108 heapsort(fs->clfs_segtabp, lfs_sb_getnseg(fs), sizeof(struct clfs_seguse *), 1109 cb_comparator); 1110 1111 /* If no segment is cleanable, just return */ 1112 if (fs->clfs_segtabp[0]->priority == 0) { 1113 dlog("%s: no segment cleanable", lfs_sb_getfsmnt(fs)); 1114 return 0; 1115 } 1116 1117 /* Load some segments' blocks into bip */ 1118 bic = 0; 1119 fs->clfs_nactive = 0; 1120 ngood = 0; 1121 if (use_bytes) { 1122 /* Set attainable goal */ 1123 goal = lfs_sb_getssize(fs) * atatime; 1124 if (goal > (cip->clean - 1) * lfs_sb_getssize(fs) / 2) 1125 goal = MAX((cip->clean - 1) * lfs_sb_getssize(fs), 1126 lfs_sb_getssize(fs)) / 2; 1127 1128 dlog("%s: cleaning with goal %" PRId64 1129 " bytes (%d segs clean, %d cleanable)", 1130 lfs_sb_getfsmnt(fs), goal, cip->clean, npos); 1131 syslog(LOG_INFO, "%s: cleaning with goal %" PRId64 1132 " bytes (%d segs clean, %d cleanable)", 1133 lfs_sb_getfsmnt(fs), goal, cip->clean, npos); 1134 totbytes = 0; 1135 for (i = 0; i < lfs_sb_getnseg(fs) && totbytes < goal; i++) { 1136 if (fs->clfs_segtabp[i]->priority == 0) 1137 break; 1138 /* Upper bound on number of segments at once */ 1139 if (ngood * lfs_sb_getssize(fs) > 4 * goal) 1140 break; 1141 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1142 dlog("%s: add seg %d prio %" PRIu64 1143 " containing %ld bytes", 1144 lfs_sb_getfsmnt(fs), sn, fs->clfs_segtabp[i]->priority, 1145 fs->clfs_segtabp[i]->nbytes); 1146 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) { 1147 ++ngood; 1148 widebic = bic; 1149 toss_old_blocks(fs, &bip, &widebic, &bytes); 1150 bic = widebic; 1151 totbytes += bytes; 1152 } else if (r == 0) 1153 fd_release(fs->clfs_devvp); 1154 else 1155 break; 1156 } 1157 } else { 1158 /* Set attainable goal */ 1159 goal = atatime; 1160 if (goal > cip->clean - 1) 1161 goal = MAX(cip->clean - 1, 1); 1162 1163 dlog("%s: cleaning with goal %d segments (%d clean, %d cleanable)", 1164 lfs_sb_getfsmnt(fs), (int)goal, cip->clean, npos); 1165 for (i = 0; i < lfs_sb_getnseg(fs) && ngood < goal; i++) { 1166 if (fs->clfs_segtabp[i]->priority == 0) 1167 break; 1168 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab); 1169 dlog("%s: add seg %d prio %" PRIu64, 1170 lfs_sb_getfsmnt(fs), sn, fs->clfs_segtabp[i]->priority); 1171 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) 1172 ++ngood; 1173 else if (r == 0) 1174 fd_release(fs->clfs_devvp); 1175 else 1176 break; 1177 } 1178 widebic = bic; 1179 toss_old_blocks(fs, &bip, &widebic, NULL); 1180 bic = widebic; 1181 } 1182 1183 /* If there is nothing to do, try again later. */ 1184 if (bic == 0) { 1185 dlog("%s: no blocks to clean in %d cleanable segments", 1186 lfs_sb_getfsmnt(fs), (int)ngood); 1187 fd_release_all(fs->clfs_devvp); 1188 return 0; 1189 } 1190 1191 /* Record statistics */ 1192 for (i = nb = 0; i < bic; i++) 1193 nb += bip[i].bi_size; 1194 util = ((double)nb) / (fs->clfs_nactive * lfs_sb_getssize(fs)); 1195 cleaner_stats.util_tot += util; 1196 cleaner_stats.util_sos += util * util; 1197 cleaner_stats.bytes_written += nb; 1198 1199 /* 1200 * Check out our blocks to see if there are hidden cleaning costs. 1201 * If there are, we might be cleaning ourselves deeper into a hole 1202 * rather than doing anything useful. 1203 * XXX do something about this. 1204 */ 1205 if_extra = 0; 1206 extra = lfs_sb_getbsize(fs) * (off_t)check_hidden_cost(fs, bip, bic, &if_extra); 1207 if_extra *= lfs_sb_getbsize(fs); 1208 1209 /* 1210 * Use markv to move the blocks. 1211 */ 1212 if (do_small) 1213 inc = MAXPHYS / lfs_sb_getbsize(fs) - 1; 1214 else 1215 inc = LFS_MARKV_MAXBLKCNT / 2; 1216 for (mc = 0, mbip = bip; mc < bic; mc += inc, mbip += inc) { 1217 lim.blkiov = mbip; 1218 lim.blkcnt = (bic - mc > inc ? inc : bic - mc); 1219 #ifdef TEST_PATTERN 1220 dlog("checking blocks %d-%d", mc, mc + lim.blkcnt - 1); 1221 for (i = 0; i < lim.blkcnt; i++) { 1222 check_test_pattern(mbip + i); 1223 } 1224 #endif /* TEST_PATTERN */ 1225 dlog("sending blocks %d-%d", mc, mc + lim.blkcnt - 1); 1226 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim))<0) { 1227 int oerrno = errno; 1228 syslog(LOG_WARNING, "%s: markv returned %d (errno %d, %m)", 1229 lfs_sb_getfsmnt(fs), r, errno); 1230 if (oerrno != EAGAIN && oerrno != ESHUTDOWN) { 1231 syslog(LOG_DEBUG, "%s: errno %d, returning", 1232 lfs_sb_getfsmnt(fs), oerrno); 1233 fd_release_all(fs->clfs_devvp); 1234 return r; 1235 } 1236 if (oerrno == ESHUTDOWN) { 1237 syslog(LOG_NOTICE, "%s: filesystem unmounted", 1238 lfs_sb_getfsmnt(fs)); 1239 fd_release_all(fs->clfs_devvp); 1240 return r; 1241 } 1242 } 1243 } 1244 1245 /* 1246 * Report progress (or lack thereof) 1247 */ 1248 syslog(LOG_INFO, "%s: wrote %" PRId64 " dirty + %" 1249 PRId64 " supporting indirect + %" 1250 PRId64 " supporting Ifile = %" 1251 PRId64 " bytes to clean %d segs (%" PRId64 "%% recovery)", 1252 lfs_sb_getfsmnt(fs), (int64_t)nb, (int64_t)(extra - if_extra), 1253 (int64_t)if_extra, (int64_t)(nb + extra), ngood, 1254 (ngood ? (int64_t)(100 - (100 * (nb + extra)) / 1255 (ngood * lfs_sb_getssize(fs))) : 1256 (int64_t)0)); 1257 if (nb + extra >= ngood * lfs_sb_getssize(fs)) 1258 syslog(LOG_WARNING, "%s: cleaner not making forward progress", 1259 lfs_sb_getfsmnt(fs)); 1260 1261 /* 1262 * Finally call reclaim to prompt cleaning of the segments. 1263 */ 1264 kops.ko_fcntl(fs->clfs_ifilefd, LFCNRECLAIM, NULL); 1265 1266 fd_release_all(fs->clfs_devvp); 1267 return 0; 1268 } 1269 1270 /* 1271 * Read the cleanerinfo block and apply cleaning policy to determine whether 1272 * the given filesystem needs to be cleaned. Returns 1 if it does, 0 if it 1273 * does not, or -1 on error. 1274 */ 1275 static int 1276 needs_cleaning(struct clfs *fs, CLEANERINFO64 *cip) 1277 { 1278 CLEANERINFO *cipu; 1279 struct ubuf *bp; 1280 struct stat st; 1281 daddr_t fsb_per_seg, max_free_segs; 1282 time_t now; 1283 double loadavg; 1284 1285 /* If this fs is "on hold", don't clean it. */ 1286 if (fs->clfs_onhold) 1287 return 0; 1288 1289 /* 1290 * Read the cleanerinfo block from the Ifile. We don't want 1291 * the cached information, so invalidate the buffer before 1292 * handing it back. 1293 */ 1294 if (bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp)) { 1295 syslog(LOG_ERR, "%s: can't read inode", lfs_sb_getfsmnt(fs)); 1296 return -1; 1297 } 1298 cipu = (CLEANERINFO *)bp->b_data; 1299 if (fs->lfs_is64) { 1300 /* Structure copy */ 1301 *cip = cipu->u_64; 1302 } else { 1303 /* Copy the fields and promote to 64 bit */ 1304 cip->clean = cipu->u_32.clean; 1305 cip->dirty = cipu->u_32.dirty; 1306 cip->bfree = cipu->u_32.bfree; 1307 cip->avail = cipu->u_32.avail; 1308 cip->free_head = cipu->u_32.free_head; 1309 cip->free_tail = cipu->u_32.free_tail; 1310 cip->flags = cipu->u_32.flags; 1311 } 1312 brelse(bp, B_INVAL); 1313 cleaner_stats.bytes_read += lfs_sb_getbsize(fs); 1314 1315 /* 1316 * If the number of segments changed under us, reinit. 1317 * We don't have to start over from scratch, however, 1318 * since we don't hold any buffers. 1319 */ 1320 if (lfs_sb_getnseg(fs) != cip->clean + cip->dirty) { 1321 if (reinit_fs(fs) < 0) { 1322 /* The normal case for unmount */ 1323 syslog(LOG_NOTICE, "%s: filesystem unmounted", lfs_sb_getfsmnt(fs)); 1324 return -1; 1325 } 1326 syslog(LOG_NOTICE, "%s: nsegs changed", lfs_sb_getfsmnt(fs)); 1327 } 1328 1329 /* Compute theoretical "free segments" maximum based on usage */ 1330 fsb_per_seg = lfs_segtod(fs, 1); 1331 max_free_segs = MAX(cip->bfree, 0) / fsb_per_seg + lfs_sb_getminfreeseg(fs); 1332 1333 dlog("%s: bfree = %d, avail = %d, clean = %d/%d", 1334 lfs_sb_getfsmnt(fs), cip->bfree, cip->avail, cip->clean, 1335 lfs_sb_getnseg(fs)); 1336 1337 /* If the writer is waiting on us, clean it */ 1338 if (cip->clean <= lfs_sb_getminfreeseg(fs) || 1339 (cip->flags & LFS_CLEANER_MUST_CLEAN)) 1340 return 1; 1341 1342 /* If there are enough segments, don't clean it */ 1343 if (cip->bfree - cip->avail <= fsb_per_seg && 1344 cip->avail > fsb_per_seg) 1345 return 0; 1346 1347 /* If we are in dire straits, clean it */ 1348 if (cip->bfree - cip->avail > fsb_per_seg && 1349 cip->avail <= fsb_per_seg) 1350 return 1; 1351 1352 /* If under busy threshold, clean regardless of load */ 1353 if (cip->clean < max_free_segs * BUSY_LIM) 1354 return 1; 1355 1356 /* Check busy status; clean if idle and under idle limit */ 1357 if (use_fs_idle) { 1358 /* Filesystem idle */ 1359 time(&now); 1360 if (fstat(fs->clfs_ifilefd, &st) < 0) { 1361 syslog(LOG_ERR, "%s: failed to stat ifile", 1362 lfs_sb_getfsmnt(fs)); 1363 return -1; 1364 } 1365 if (now - st.st_mtime > segwait_timeout && 1366 cip->clean < max_free_segs * IDLE_LIM) 1367 return 1; 1368 } else { 1369 /* CPU idle - use one-minute load avg */ 1370 if (getloadavg(&loadavg, 1) == -1) { 1371 syslog(LOG_ERR, "%s: failed to get load avg", 1372 lfs_sb_getfsmnt(fs)); 1373 return -1; 1374 } 1375 if (loadavg < load_threshold && 1376 cip->clean < max_free_segs * IDLE_LIM) 1377 return 1; 1378 } 1379 1380 return 0; 1381 } 1382 1383 /* 1384 * Report statistics. If the signal was SIGUSR2, clear the statistics too. 1385 * If the signal was SIGINT, exit. 1386 */ 1387 static void 1388 sig_report(int sig) 1389 { 1390 double avg = 0.0, stddev; 1391 1392 avg = cleaner_stats.util_tot / MAX(cleaner_stats.segs_cleaned, 1.0); 1393 stddev = cleaner_stats.util_sos / MAX(cleaner_stats.segs_cleaned - 1394 avg * avg, 1.0); 1395 syslog(LOG_INFO, "bytes read: %" PRId64, cleaner_stats.bytes_read); 1396 syslog(LOG_INFO, "bytes written: %" PRId64, cleaner_stats.bytes_written); 1397 syslog(LOG_INFO, "segments cleaned: %" PRId64, cleaner_stats.segs_cleaned); 1398 #if 0 1399 /* "Empty segments" is meaningless, since the kernel handles those */ 1400 syslog(LOG_INFO, "empty segments: %" PRId64, cleaner_stats.segs_empty); 1401 #endif 1402 syslog(LOG_INFO, "error segments: %" PRId64, cleaner_stats.segs_error); 1403 syslog(LOG_INFO, "utilization total: %g", cleaner_stats.util_tot); 1404 syslog(LOG_INFO, "utilization sos: %g", cleaner_stats.util_sos); 1405 syslog(LOG_INFO, "utilization avg: %4.2f", avg); 1406 syslog(LOG_INFO, "utilization sdev: %9.6f", stddev); 1407 1408 if (debug) 1409 bufstats(); 1410 1411 if (sig == SIGUSR2) 1412 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1413 if (sig == SIGINT) 1414 exit(0); 1415 } 1416 1417 static void 1418 sig_exit(int sig) 1419 { 1420 exit(0); 1421 } 1422 1423 static void 1424 usage(void) 1425 { 1426 errx(1, "usage: lfs_cleanerd [-bcdfmqs] [-i segnum] [-l load] " 1427 "[-n nsegs] [-r report_freq] [-t timeout] fs_name ..."); 1428 } 1429 1430 #ifndef LFS_CLEANER_AS_LIB 1431 /* 1432 * Main. 1433 */ 1434 int 1435 main(int argc, char **argv) 1436 { 1437 1438 return lfs_cleaner_main(argc, argv); 1439 } 1440 #endif 1441 1442 int 1443 lfs_cleaner_main(int argc, char **argv) 1444 { 1445 int i, opt, error, r, loopcount, nodetach; 1446 struct timeval tv; 1447 #ifdef LFS_CLEANER_AS_LIB 1448 sem_t *semaddr = NULL; 1449 #endif 1450 CLEANERINFO64 ci; 1451 #ifndef USE_CLIENT_SERVER 1452 char *cp, *pidname; 1453 #endif 1454 1455 #if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ == 8 && \ 1456 defined(__OPTIMIZE_SIZE__) 1457 /* 1458 * XXX: Work around apparent bug with gcc 4.8 and -Os: it 1459 * claims that ci.clean is uninitialized in clean_fs (at one 1460 * of the several uses of it, which is neither the first nor 1461 * last use) -- this doesn't happen with plain -O2. 1462 * 1463 * Hopefully in the future further rearrangements will allow 1464 * removing this hack. 1465 */ 1466 ci.clean = 0; 1467 #endif 1468 1469 /* 1470 * Set up defaults 1471 */ 1472 atatime = 1; 1473 segwait_timeout = 300; /* Five minutes */ 1474 load_threshold = 0.2; 1475 stat_report = 0; 1476 inval_segment = -1; 1477 copylog_filename = NULL; 1478 nodetach = 0; 1479 1480 /* 1481 * Parse command-line arguments 1482 */ 1483 while ((opt = getopt(argc, argv, "bC:cdDfi:l:mn:qr:sS:t:")) != -1) { 1484 switch (opt) { 1485 case 'b': /* Use bytes written, not segments read */ 1486 use_bytes = 1; 1487 break; 1488 case 'C': /* copy log */ 1489 copylog_filename = optarg; 1490 break; 1491 case 'c': /* Coalesce files */ 1492 do_coalesce++; 1493 break; 1494 case 'd': /* Debug mode. */ 1495 nodetach++; 1496 debug++; 1497 break; 1498 case 'D': /* stay-on-foreground */ 1499 nodetach++; 1500 break; 1501 case 'f': /* Use fs idle time rather than cpu idle */ 1502 use_fs_idle = 1; 1503 break; 1504 case 'i': /* Invalidate this segment */ 1505 inval_segment = atoi(optarg); 1506 break; 1507 case 'l': /* Load below which to clean */ 1508 load_threshold = atof(optarg); 1509 break; 1510 case 'm': /* [compat only] */ 1511 break; 1512 case 'n': /* How many segs to clean at once */ 1513 atatime = atoi(optarg); 1514 break; 1515 case 'q': /* Quit after one run */ 1516 do_quit = 1; 1517 break; 1518 case 'r': /* Report every stat_report segments */ 1519 stat_report = atoi(optarg); 1520 break; 1521 case 's': /* Small writes */ 1522 do_small = 1; 1523 break; 1524 #ifdef LFS_CLEANER_AS_LIB 1525 case 'S': /* semaphore */ 1526 semaddr = (void*)(uintptr_t)strtoull(optarg,NULL,0); 1527 break; 1528 #endif 1529 case 't': /* timeout */ 1530 segwait_timeout = atoi(optarg); 1531 break; 1532 default: 1533 usage(); 1534 /* NOTREACHED */ 1535 } 1536 } 1537 argc -= optind; 1538 argv += optind; 1539 1540 if (argc < 1) 1541 usage(); 1542 if (inval_segment >= 0 && argc != 1) { 1543 errx(1, "lfs_cleanerd: may only specify one filesystem when " 1544 "using -i flag"); 1545 } 1546 1547 if (do_coalesce) { 1548 errx(1, "lfs_cleanerd: -c disabled due to reports of file " 1549 "corruption; you may re-enable it by rebuilding the " 1550 "cleaner"); 1551 } 1552 1553 /* 1554 * Set up daemon mode or foreground mode 1555 */ 1556 if (nodetach) { 1557 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID | LOG_PERROR, 1558 LOG_DAEMON); 1559 signal(SIGINT, sig_report); 1560 } else { 1561 if (daemon(0, 0) == -1) 1562 err(1, "lfs_cleanerd: couldn't become a daemon!"); 1563 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID, LOG_DAEMON); 1564 signal(SIGINT, sig_exit); 1565 } 1566 1567 /* 1568 * Look for an already-running master daemon. If there is one, 1569 * send it our filesystems to add to its list and exit. 1570 * If there is none, become the master. 1571 */ 1572 #ifdef USE_CLIENT_SERVER 1573 try_to_become_master(argc, argv); 1574 #else 1575 /* XXX think about this */ 1576 asprintf(&pidname, "lfs_cleanerd:m:%s", argv[0]); 1577 if (pidname == NULL) { 1578 syslog(LOG_ERR, "malloc failed: %m"); 1579 exit(1); 1580 } 1581 for (cp = pidname; cp != NULL; cp = strchr(cp, '/')) 1582 *cp = '|'; 1583 pidfile(pidname); 1584 #endif 1585 1586 /* 1587 * Signals mean daemon should report its statistics 1588 */ 1589 memset(&cleaner_stats, 0, sizeof(cleaner_stats)); 1590 signal(SIGUSR1, sig_report); 1591 signal(SIGUSR2, sig_report); 1592 1593 /* 1594 * Start up buffer cache. We only use this for the Ifile, 1595 * and we will resize it if necessary, so it can start small. 1596 */ 1597 bufinit(4); 1598 1599 #ifdef REPAIR_ZERO_FINFO 1600 { 1601 BLOCK_INFO *bip = NULL; 1602 int bic = 0; 1603 1604 nfss = 1; 1605 fsp = (struct clfs **)malloc(sizeof(*fsp)); 1606 fsp[0] = (struct clfs *)calloc(1, sizeof(**fsp)); 1607 1608 if (init_unmounted_fs(fsp[0], argv[0]) < 0) { 1609 err(1, "init_unmounted_fs"); 1610 } 1611 dlog("Filesystem has %d segments", fsp[0]->lfs_nseg); 1612 for (i = 0; i < fsp[0]->lfs_nseg; i++) { 1613 load_segment(fsp[0], i, &bip, &bic); 1614 bic = 0; 1615 } 1616 exit(0); 1617 } 1618 #endif 1619 1620 /* 1621 * Initialize cleaning structures, open devices, etc. 1622 */ 1623 nfss = argc; 1624 fsp = (struct clfs **)malloc(nfss * sizeof(*fsp)); 1625 if (fsp == NULL) { 1626 syslog(LOG_ERR, "couldn't allocate fs table: %m"); 1627 exit(1); 1628 } 1629 for (i = 0; i < nfss; i++) { 1630 fsp[i] = (struct clfs *)calloc(1, sizeof(**fsp)); 1631 if ((r = init_fs(fsp[i], argv[i])) < 0) { 1632 syslog(LOG_ERR, "%s: couldn't init: error code %d", 1633 argv[i], r); 1634 handle_error(fsp, i); 1635 --i; /* Do the new #i over again */ 1636 } 1637 } 1638 1639 /* 1640 * If asked to coalesce, do so and exit. 1641 */ 1642 if (do_coalesce) { 1643 for (i = 0; i < nfss; i++) 1644 clean_all_inodes(fsp[i]); 1645 exit(0); 1646 } 1647 1648 /* 1649 * If asked to invalidate a segment, do that and exit. 1650 */ 1651 if (inval_segment >= 0) { 1652 invalidate_segment(fsp[0], inval_segment); 1653 exit(0); 1654 } 1655 1656 /* 1657 * Main cleaning loop. 1658 */ 1659 loopcount = 0; 1660 #ifdef LFS_CLEANER_AS_LIB 1661 if (semaddr) 1662 sem_post(semaddr); 1663 #endif 1664 error = 0; 1665 while (nfss > 0) { 1666 int cleaned_one; 1667 do { 1668 #ifdef USE_CLIENT_SERVER 1669 check_control_socket(); 1670 #endif 1671 cleaned_one = 0; 1672 for (i = 0; i < nfss; i++) { 1673 if ((error = needs_cleaning(fsp[i], &ci)) < 0) { 1674 syslog(LOG_DEBUG, "%s: needs_cleaning returned %d", 1675 getprogname(), error); 1676 handle_error(fsp, i); 1677 continue; 1678 } 1679 if (error == 0) /* No need to clean */ 1680 continue; 1681 1682 reload_ifile(fsp[i]); 1683 if ((error = clean_fs(fsp[i], &ci)) < 0) { 1684 syslog(LOG_DEBUG, "%s: clean_fs returned %d", 1685 getprogname(), error); 1686 handle_error(fsp, i); 1687 continue; 1688 } 1689 ++cleaned_one; 1690 } 1691 ++loopcount; 1692 if (stat_report && loopcount % stat_report == 0) 1693 sig_report(0); 1694 if (do_quit) 1695 exit(0); 1696 } while(cleaned_one); 1697 tv.tv_sec = segwait_timeout; 1698 tv.tv_usec = 0; 1699 /* XXX: why couldn't others work if fsp socket is shutdown? */ 1700 error = kops.ko_fcntl(fsp[0]->clfs_ifilefd,LFCNSEGWAITALL,&tv); 1701 if (error) { 1702 if (errno == ESHUTDOWN) { 1703 for (i = 0; i < nfss; i++) { 1704 syslog(LOG_INFO, "%s: shutdown", 1705 getprogname()); 1706 handle_error(fsp, i); 1707 assert(nfss == 0); 1708 } 1709 } else { 1710 #ifdef LFS_CLEANER_AS_LIB 1711 error = ESHUTDOWN; 1712 break; 1713 #else 1714 err(1, "LFCNSEGWAITALL"); 1715 #endif 1716 } 1717 } 1718 } 1719 1720 /* NOTREACHED */ 1721 return error; 1722 } 1723