1 /* $NetBSD: coalesce.c,v 1.17 2009/03/16 00:08:10 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/param.h> 33 #include <sys/mount.h> 34 #include <sys/time.h> 35 #include <sys/resource.h> 36 #include <sys/types.h> 37 #include <sys/wait.h> 38 #include <sys/mman.h> 39 40 #include <ufs/ufs/dinode.h> 41 #include <ufs/lfs/lfs.h> 42 43 #include <fcntl.h> 44 #include <signal.h> 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <string.h> 48 #include <time.h> 49 #include <unistd.h> 50 #include <util.h> 51 #include <errno.h> 52 #include <err.h> 53 54 #include <syslog.h> 55 56 #include "bufcache.h" 57 #include "vnode.h" 58 #include "cleaner.h" 59 60 extern int debug, do_mmap; 61 62 int log2int(int n) 63 { 64 int log; 65 66 log = 0; 67 while (n > 0) { 68 ++log; 69 n >>= 1; 70 } 71 return log - 1; 72 } 73 74 enum coalesce_returncodes { 75 COALESCE_OK = 0, 76 COALESCE_NOINODE, 77 COALESCE_TOOSMALL, 78 COALESCE_BADSIZE, 79 COALESCE_BADBLOCKSIZE, 80 COALESCE_NOMEM, 81 COALESCE_BADBMAPV, 82 COALESCE_BADMARKV, 83 COALESCE_NOTWORTHIT, 84 COALESCE_NOTHINGLEFT, 85 COALESCE_EIO, 86 87 COALESCE_MAXERROR 88 }; 89 90 const char *coalesce_return[] = { 91 "Successfully coalesced", 92 "File not in use or inode not found", 93 "Not large enough to coalesce", 94 "Negative size", 95 "Not enough blocks to account for size", 96 "Malloc failed", 97 "LFCNBMAPV failed", 98 "Not broken enough to fix", 99 "Too many blocks not found", 100 "Too many blocks found in active segments", 101 "I/O error", 102 103 "No such error" 104 }; 105 106 static struct ufs1_dinode * 107 get_dinode(struct clfs *fs, ino_t ino) 108 { 109 IFILE *ifp; 110 daddr_t daddr; 111 struct ubuf *bp; 112 struct ufs1_dinode *dip, *r; 113 114 lfs_ientry(&ifp, fs, ino, &bp); 115 daddr = ifp->if_daddr; 116 brelse(bp, 0); 117 118 if (daddr == 0x0) 119 return NULL; 120 121 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp); 122 for (dip = (struct ufs1_dinode *)bp->b_data; 123 dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++) 124 if (dip->di_inumber == ino) { 125 r = (struct ufs1_dinode *)malloc(sizeof(*r)); 126 memcpy(r, dip, sizeof(*r)); 127 brelse(bp, 0); 128 return r; 129 } 130 brelse(bp, 0); 131 return NULL; 132 } 133 134 /* 135 * Find out if this inode's data blocks are discontinuous; if they are, 136 * rewrite them using markv. Return the number of inodes rewritten. 137 */ 138 static int 139 clean_inode(struct clfs *fs, ino_t ino) 140 { 141 BLOCK_INFO *bip = NULL, *tbip; 142 CLEANERINFO cip; 143 struct ubuf *bp; 144 struct ufs1_dinode *dip; 145 struct clfs_seguse *sup; 146 struct lfs_fcntl_markv /* { 147 BLOCK_INFO *blkiov; 148 int blkcnt; 149 } */ lim; 150 daddr_t toff; 151 int i; 152 int nb, onb, noff; 153 int retval; 154 int bps; 155 156 dip = get_dinode(fs, ino); 157 if (dip == NULL) 158 return COALESCE_NOINODE; 159 160 /* Compute file block size, set up for bmapv */ 161 onb = nb = lblkno(fs, dip->di_size); 162 163 /* XXX for now, don't do any file small enough to have fragments */ 164 if (nb < NDADDR) { 165 free(dip); 166 return COALESCE_TOOSMALL; 167 } 168 169 /* Sanity checks */ 170 #if 0 /* di_size is uint64_t -- this is a noop */ 171 if (dip->di_size < 0) { 172 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size); 173 free(dip); 174 return COALESCE_BADSIZE; 175 } 176 #endif 177 if (nb > dip->di_blocks) { 178 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb, 179 dip->di_blocks); 180 free(dip); 181 return COALESCE_BADBLOCKSIZE; 182 } 183 184 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb); 185 if (bip == NULL) { 186 syslog(LOG_WARNING, "ino %llu, %d blocks: %m", 187 (unsigned long long)ino, nb); 188 free(dip); 189 return COALESCE_NOMEM; 190 } 191 for (i = 0; i < nb; i++) { 192 memset(bip + i, 0, sizeof(BLOCK_INFO)); 193 bip[i].bi_inode = ino; 194 bip[i].bi_lbn = i; 195 bip[i].bi_version = dip->di_gen; 196 /* Don't set the size, but let lfs_bmap fill it in */ 197 } 198 lim.blkiov = bip; 199 lim.blkcnt = nb; 200 if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) { 201 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m", 202 fs->lfs_fsmnt); 203 retval = COALESCE_BADBMAPV; 204 goto out; 205 } 206 #if 0 207 for (i = 0; i < nb; i++) { 208 printf("bi_size = %d, bi_ino = %d, " 209 "bi_lbn = %d, bi_daddr = %d\n", 210 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn, 211 bip[i].bi_daddr); 212 } 213 #endif 214 noff = toff = 0; 215 for (i = 1; i < nb; i++) { 216 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag) 217 ++noff; 218 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr 219 - fs->lfs_frag) >> fs->lfs_fbshift; 220 } 221 222 /* 223 * If this file is not discontinuous, there's no point in rewriting it. 224 * 225 * Explicitly allow a certain amount of discontinuity, since large 226 * files will be broken among segments and medium-sized files 227 * can have a break or two and it's okay. 228 */ 229 if (nb <= 1 || noff == 0 || noff < log2int(nb) || 230 segtod(fs, noff) * 2 < nb) { 231 retval = COALESCE_NOTWORTHIT; 232 goto out; 233 } else if (debug) 234 syslog(LOG_DEBUG, "ino %llu total discontinuity " 235 "%d (%lld) for %d blocks", (unsigned long long)ino, 236 noff, (long long)toff, nb); 237 238 /* Search for blocks in active segments; don't move them. */ 239 for (i = 0; i < nb; i++) { 240 if (bip[i].bi_daddr <= 0) 241 continue; 242 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)]; 243 if (sup->flags & SEGUSE_ACTIVE) 244 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */ 245 } 246 247 /* 248 * Get rid of any blocks we've marked dead. If this is an older 249 * kernel that doesn't have bmapv fill in the block sizes, we'll 250 * toss everything here. 251 */ 252 onb = nb; 253 toss_old_blocks(fs, &bip, &nb, NULL); 254 nb = i; 255 256 /* 257 * We may have tossed enough blocks that it is no longer worthwhile 258 * to rewrite this inode. 259 */ 260 if (nb == 0 || onb - nb > log2int(onb)) { 261 if (debug) 262 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting"); 263 retval = COALESCE_NOTHINGLEFT; 264 goto out; 265 } 266 267 /* 268 * We are going to rewrite this inode. 269 * For any remaining blocks, read in their contents. 270 */ 271 for (i = 0; i < nb; i++) { 272 bip[i].bi_bp = malloc(bip[i].bi_size); 273 if (bip[i].bi_bp == NULL) { 274 syslog(LOG_WARNING, "allocate block buffer size=%d: %m", 275 bip[i].bi_size); 276 retval = COALESCE_NOMEM; 277 goto out; 278 } 279 280 if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size, 281 fsbtob(fs, bip[i].bi_daddr)) < 0) { 282 retval = COALESCE_EIO; 283 goto out; 284 } 285 } 286 if (debug) 287 syslog(LOG_DEBUG, "ino %llu markv %d blocks", 288 (unsigned long long)ino, nb); 289 290 /* 291 * Write in segment-sized chunks. If at any point we'd write more 292 * than half of the available segments, sleep until that's not 293 * true any more. 294 */ 295 bps = segtod(fs, 1); 296 for (tbip = bip; tbip < bip + nb; tbip += bps) { 297 do { 298 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp); 299 cip = *(CLEANERINFO *)bp->b_data; 300 brelse(bp, B_INVAL); 301 302 if (cip.clean < 4) /* XXX magic number 4 */ 303 fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL); 304 } while(cip.clean < 4); 305 306 lim.blkiov = tbip; 307 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps); 308 if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) { 309 retval = COALESCE_BADMARKV; 310 goto out; 311 } 312 } 313 314 retval = COALESCE_OK; 315 out: 316 free(dip); 317 if (bip) { 318 for (i = 0; i < onb; i++) 319 if (bip[i].bi_bp) 320 free(bip[i].bi_bp); 321 free(bip); 322 } 323 return retval; 324 } 325 326 /* 327 * Try coalescing every inode in the filesystem. 328 * Return the number of inodes actually altered. 329 */ 330 int clean_all_inodes(struct clfs *fs) 331 { 332 int i, r, maxino; 333 int totals[COALESCE_MAXERROR]; 334 struct stat st; 335 336 memset(totals, 0, sizeof(totals)); 337 338 fstat(fs->clfs_ifilefd, &st); 339 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) - 340 fs->lfs_segtabsz - fs->lfs_cleansz; 341 342 for (i = 0; i < maxino; i++) { 343 r = clean_inode(fs, i); 344 ++totals[r]; 345 } 346 347 for (i = 0; i < COALESCE_MAXERROR; i++) 348 if (totals[i]) 349 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i], 350 totals[i]); 351 352 return totals[COALESCE_OK]; 353 } 354 355 /* 356 * Fork a child process to coalesce this fs. 357 */ 358 int 359 fork_coalesce(struct clfs *fs) 360 { 361 static pid_t childpid; 362 int num; 363 364 /* 365 * If already running a coalescing child, don't start a new one. 366 */ 367 if (childpid) { 368 if (waitpid(childpid, NULL, WNOHANG) == childpid) 369 childpid = 0; 370 } 371 if (childpid && kill(childpid, 0) >= 0) { 372 /* already running a coalesce process */ 373 if (debug) 374 syslog(LOG_DEBUG, "coalescing already in progress"); 375 return 0; 376 } 377 378 /* 379 * Fork a child and let the child coalease 380 */ 381 childpid = fork(); 382 if (childpid < 0) { 383 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt); 384 return 0; 385 } else if (childpid == 0) { 386 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d", 387 fs->lfs_fsmnt, getpid()); 388 num = clean_all_inodes(fs); 389 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes", 390 fs->lfs_fsmnt, num); 391 exit(0); 392 } 393 394 return 0; 395 } 396