1 /* $NetBSD: coalesce.c,v 1.13 2006/05/12 19:35:27 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 #include <sys/param.h> 40 #include <sys/mount.h> 41 #include <sys/time.h> 42 #include <sys/resource.h> 43 #include <sys/types.h> 44 #include <sys/wait.h> 45 #include <sys/mman.h> 46 47 #include <ufs/ufs/dinode.h> 48 #include <ufs/lfs/lfs.h> 49 50 #include <fcntl.h> 51 #include <signal.h> 52 #include <stdio.h> 53 #include <stdlib.h> 54 #include <string.h> 55 #include <time.h> 56 #include <unistd.h> 57 #include <util.h> 58 #include <errno.h> 59 #include <err.h> 60 61 #include <syslog.h> 62 63 #include "bufcache.h" 64 #include "vnode.h" 65 #include "cleaner.h" 66 67 extern int debug, do_mmap; 68 69 int log2int(int n) 70 { 71 int log; 72 73 log = 0; 74 while (n > 0) { 75 ++log; 76 n >>= 1; 77 } 78 return log - 1; 79 } 80 81 enum coalesce_returncodes { 82 COALESCE_OK = 0, 83 COALESCE_NOINODE, 84 COALESCE_TOOSMALL, 85 COALESCE_BADSIZE, 86 COALESCE_BADBLOCKSIZE, 87 COALESCE_NOMEM, 88 COALESCE_BADBMAPV, 89 COALESCE_BADMARKV, 90 COALESCE_NOTWORTHIT, 91 COALESCE_NOTHINGLEFT, 92 COALESCE_EIO, 93 94 COALESCE_MAXERROR 95 }; 96 97 char *coalesce_return[] = { 98 "Successfully coalesced", 99 "File not in use or inode not found", 100 "Not large enough to coalesce", 101 "Negative size", 102 "Not enough blocks to account for size", 103 "Malloc failed", 104 "LFCNBMAPV failed", 105 "Not broken enough to fix", 106 "Too many blocks not found", 107 "Too many blocks found in active segments", 108 "I/O error", 109 110 "No such error" 111 }; 112 113 static struct ufs1_dinode * 114 get_dinode(struct clfs *fs, ino_t ino) 115 { 116 IFILE *ifp; 117 daddr_t daddr; 118 struct ubuf *bp; 119 struct ufs1_dinode *dip, *r; 120 121 lfs_ientry(&ifp, fs, ino, &bp); 122 daddr = ifp->if_daddr; 123 brelse(bp); 124 125 if (daddr == 0x0) 126 return NULL; 127 128 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp); 129 for (dip = (struct ufs1_dinode *)bp->b_data; 130 dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++) 131 if (dip->di_inumber == ino) { 132 r = (struct ufs1_dinode *)malloc(sizeof(*r)); 133 memcpy(r, dip, sizeof(*r)); 134 brelse(bp); 135 return r; 136 } 137 brelse(bp); 138 return NULL; 139 } 140 141 /* 142 * Find out if this inode's data blocks are discontinuous; if they are, 143 * rewrite them using markv. Return the number of inodes rewritten. 144 */ 145 static int 146 clean_inode(struct clfs *fs, ino_t ino) 147 { 148 BLOCK_INFO *bip = NULL, *tbip; 149 CLEANERINFO cip; 150 struct ubuf *bp; 151 struct ufs1_dinode *dip; 152 struct clfs_seguse *sup; 153 struct lfs_fcntl_markv /* { 154 BLOCK_INFO *blkiov; 155 int blkcnt; 156 } */ lim; 157 daddr_t toff; 158 int i; 159 int nb, onb, noff; 160 int retval; 161 int bps; 162 163 dip = get_dinode(fs, ino); 164 if (dip == NULL) 165 return COALESCE_NOINODE; 166 167 /* Compute file block size, set up for bmapv */ 168 onb = nb = lblkno(fs, dip->di_size); 169 170 /* XXX for now, don't do any file small enough to have fragments */ 171 if (nb < NDADDR) { 172 free(dip); 173 return COALESCE_TOOSMALL; 174 } 175 176 /* Sanity checks */ 177 if (dip->di_size < 0) { 178 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size); 179 free(dip); 180 return COALESCE_BADSIZE; 181 } 182 if (nb > dip->di_blocks) { 183 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb, 184 dip->di_blocks); 185 free(dip); 186 return COALESCE_BADBLOCKSIZE; 187 } 188 189 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb); 190 if (bip == NULL) { 191 syslog(LOG_WARNING, "ino %llu, %d blocks: %m", 192 (unsigned long long)ino, nb); 193 free(dip); 194 return COALESCE_NOMEM; 195 } 196 for (i = 0; i < nb; i++) { 197 memset(bip + i, 0, sizeof(BLOCK_INFO)); 198 bip[i].bi_inode = ino; 199 bip[i].bi_lbn = i; 200 bip[i].bi_version = dip->di_gen; 201 /* Don't set the size, but let lfs_bmap fill it in */ 202 } 203 lim.blkiov = bip; 204 lim.blkcnt = nb; 205 if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) { 206 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m", 207 fs->lfs_fsmnt); 208 retval = COALESCE_BADBMAPV; 209 goto out; 210 } 211 #if 0 212 for (i = 0; i < nb; i++) { 213 printf("bi_size = %d, bi_ino = %d, " 214 "bi_lbn = %d, bi_daddr = %d\n", 215 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn, 216 bip[i].bi_daddr); 217 } 218 #endif 219 noff = toff = 0; 220 for (i = 1; i < nb; i++) { 221 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag) 222 ++noff; 223 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr 224 - fs->lfs_frag) >> fs->lfs_fbshift; 225 } 226 227 /* 228 * If this file is not discontinuous, there's no point in rewriting it. 229 * 230 * Explicitly allow a certain amount of discontinuity, since large 231 * files will be broken among segments and medium-sized files 232 * can have a break or two and it's okay. 233 */ 234 if (nb <= 1 || noff == 0 || noff < log2int(nb) || 235 segtod(fs, noff) * 2 < nb) { 236 retval = COALESCE_NOTWORTHIT; 237 goto out; 238 } else if (debug) 239 syslog(LOG_DEBUG, "ino %llu total discontinuity " 240 "%d (%lld) for %d blocks", (unsigned long long)ino, 241 noff, (long long)toff, nb); 242 243 /* Search for blocks in active segments; don't move them. */ 244 for (i = 0; i < nb; i++) { 245 if (bip[i].bi_daddr <= 0) 246 continue; 247 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)]; 248 if (sup->flags & SEGUSE_ACTIVE) 249 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */ 250 } 251 252 /* 253 * Get rid of any blocks we've marked dead. If this is an older 254 * kernel that doesn't have bmapv fill in the block sizes, we'll 255 * toss everything here. 256 */ 257 onb = nb; 258 toss_old_blocks(fs, &bip, &nb, NULL); 259 nb = i; 260 261 /* 262 * We may have tossed enough blocks that it is no longer worthwhile 263 * to rewrite this inode. 264 */ 265 if (nb == 0 || onb - nb > log2int(onb)) { 266 if (debug) 267 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting"); 268 retval = COALESCE_NOTHINGLEFT; 269 goto out; 270 } 271 272 /* 273 * We are going to rewrite this inode. 274 * For any remaining blocks, read in their contents. 275 */ 276 for (i = 0; i < nb; i++) { 277 bip[i].bi_bp = malloc(bip[i].bi_size); 278 if (bip[i].bi_bp == NULL) { 279 syslog(LOG_WARNING, "allocate block buffer size=%d: %m", 280 bip[i].bi_size); 281 retval = COALESCE_NOMEM; 282 goto out; 283 } 284 285 if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size, 286 fsbtob(fs, bip[i].bi_daddr)) < 0) { 287 retval = COALESCE_EIO; 288 goto out; 289 } 290 } 291 if (debug) 292 syslog(LOG_DEBUG, "ino %llu markv %d blocks", 293 (unsigned long long)ino, nb); 294 295 /* 296 * Write in segment-sized chunks. If at any point we'd write more 297 * than half of the available segments, sleep until that's not 298 * true any more. 299 */ 300 bps = segtod(fs, 1); 301 for (tbip = bip; tbip < bip + nb; tbip += bps) { 302 do { 303 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp); 304 cip = *(CLEANERINFO *)bp->b_data; 305 bp->b_flags |= B_INVAL; 306 brelse(bp); 307 308 if (cip.clean < 4) /* XXX magic number 4 */ 309 fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL); 310 } while(cip.clean < 4); 311 312 lim.blkiov = tbip; 313 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps); 314 if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) { 315 retval = COALESCE_BADMARKV; 316 goto out; 317 } 318 } 319 320 retval = COALESCE_OK; 321 out: 322 free(dip); 323 if (bip) { 324 for (i = 0; i < onb; i++) 325 if (bip[i].bi_bp) 326 free(bip[i].bi_bp); 327 free(bip); 328 } 329 return retval; 330 } 331 332 /* 333 * Try coalescing every inode in the filesystem. 334 * Return the number of inodes actually altered. 335 */ 336 int clean_all_inodes(struct clfs *fs) 337 { 338 int i, r, maxino; 339 int totals[COALESCE_MAXERROR]; 340 struct stat st; 341 342 memset(totals, 0, sizeof(totals)); 343 344 fstat(fs->clfs_ifilefd, &st); 345 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) - 346 fs->lfs_segtabsz - fs->lfs_cleansz; 347 348 for (i = 0; i < maxino; i++) { 349 r = clean_inode(fs, i); 350 ++totals[r]; 351 } 352 353 for (i = 0; i < COALESCE_MAXERROR; i++) 354 if (totals[i]) 355 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i], 356 totals[i]); 357 358 return totals[COALESCE_OK]; 359 } 360 361 /* 362 * Fork a child process to coalesce this fs. 363 */ 364 int 365 fork_coalesce(struct clfs *fs) 366 { 367 static pid_t childpid; 368 int num; 369 370 /* 371 * If already running a coalescing child, don't start a new one. 372 */ 373 if (childpid) { 374 if (waitpid(childpid, NULL, WNOHANG) == childpid) 375 childpid = 0; 376 } 377 if (childpid && kill(childpid, 0) >= 0) { 378 /* already running a coalesce process */ 379 if (debug) 380 syslog(LOG_DEBUG, "coalescing already in progress"); 381 return 0; 382 } 383 384 /* 385 * Fork a child and let the child coalease 386 */ 387 childpid = fork(); 388 if (childpid < 0) { 389 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt); 390 return 0; 391 } else if (childpid == 0) { 392 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d", 393 fs->lfs_fsmnt, getpid()); 394 num = clean_all_inodes(fs); 395 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes", 396 fs->lfs_fsmnt, num); 397 exit(0); 398 } 399 400 return 0; 401 } 402