1 /* $NetBSD: coalesce.c,v 1.23 2013/06/18 18:18:57 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/param.h> 33 #include <sys/mount.h> 34 #include <sys/time.h> 35 #include <sys/resource.h> 36 #include <sys/types.h> 37 #include <sys/wait.h> 38 #include <sys/mman.h> 39 40 #include <ufs/lfs/lfs.h> 41 42 #include <fcntl.h> 43 #include <signal.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <time.h> 48 #include <unistd.h> 49 #include <util.h> 50 #include <errno.h> 51 #include <err.h> 52 53 #include <syslog.h> 54 55 #include "bufcache.h" 56 #include "vnode.h" 57 #include "cleaner.h" 58 #include "kernelops.h" 59 60 extern int debug, do_mmap; 61 62 int log2int(int n) 63 { 64 int log; 65 66 log = 0; 67 while (n > 0) { 68 ++log; 69 n >>= 1; 70 } 71 return log - 1; 72 } 73 74 enum coalesce_returncodes { 75 COALESCE_OK = 0, 76 COALESCE_NOINODE, 77 COALESCE_TOOSMALL, 78 COALESCE_BADSIZE, 79 COALESCE_BADBLOCKSIZE, 80 COALESCE_NOMEM, 81 COALESCE_BADBMAPV, 82 COALESCE_BADMARKV, 83 COALESCE_NOTWORTHIT, 84 COALESCE_NOTHINGLEFT, 85 COALESCE_EIO, 86 87 COALESCE_MAXERROR 88 }; 89 90 const char *coalesce_return[] = { 91 "Successfully coalesced", 92 "File not in use or inode not found", 93 "Not large enough to coalesce", 94 "Negative size", 95 "Not enough blocks to account for size", 96 "Malloc failed", 97 "LFCNBMAPV failed", 98 "Not broken enough to fix", 99 "Too many blocks not found", 100 "Too many blocks found in active segments", 101 "I/O error", 102 103 "No such error" 104 }; 105 106 static struct ulfs1_dinode * 107 get_dinode(struct clfs *fs, ino_t ino) 108 { 109 IFILE *ifp; 110 daddr_t daddr; 111 struct ubuf *bp; 112 struct ulfs1_dinode *dip, *r; 113 114 lfs_ientry(&ifp, fs, ino, &bp); 115 daddr = ifp->if_daddr; 116 brelse(bp, 0); 117 118 if (daddr == 0x0) 119 return NULL; 120 121 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp); 122 for (dip = (struct ulfs1_dinode *)bp->b_data; 123 dip < (struct ulfs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++) 124 if (dip->di_inumber == ino) { 125 r = (struct ulfs1_dinode *)malloc(sizeof(*r)); 126 if (r == NULL) 127 break; 128 memcpy(r, dip, sizeof(*r)); 129 brelse(bp, 0); 130 return r; 131 } 132 brelse(bp, 0); 133 return NULL; 134 } 135 136 /* 137 * Find out if this inode's data blocks are discontinuous; if they are, 138 * rewrite them using markv. Return the number of inodes rewritten. 139 */ 140 static int 141 clean_inode(struct clfs *fs, ino_t ino) 142 { 143 BLOCK_INFO *bip = NULL, *tbip; 144 CLEANERINFO cip; 145 struct ubuf *bp; 146 struct ulfs1_dinode *dip; 147 struct clfs_seguse *sup; 148 struct lfs_fcntl_markv /* { 149 BLOCK_INFO *blkiov; 150 int blkcnt; 151 } */ lim; 152 daddr_t toff; 153 int i; 154 int nb, onb, noff; 155 int retval; 156 int bps; 157 158 dip = get_dinode(fs, ino); 159 if (dip == NULL) 160 return COALESCE_NOINODE; 161 162 /* Compute file block size, set up for bmapv */ 163 onb = nb = lfs_lblkno(fs, dip->di_size); 164 165 /* XXX for now, don't do any file small enough to have fragments */ 166 if (nb < ULFS_NDADDR) { 167 free(dip); 168 return COALESCE_TOOSMALL; 169 } 170 171 /* Sanity checks */ 172 #if 0 /* di_size is uint64_t -- this is a noop */ 173 if (dip->di_size < 0) { 174 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size); 175 free(dip); 176 return COALESCE_BADSIZE; 177 } 178 #endif 179 if (nb > dip->di_blocks) { 180 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb, 181 dip->di_blocks); 182 free(dip); 183 return COALESCE_BADBLOCKSIZE; 184 } 185 186 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb); 187 if (bip == NULL) { 188 syslog(LOG_WARNING, "ino %llu, %d blocks: %m", 189 (unsigned long long)ino, nb); 190 free(dip); 191 return COALESCE_NOMEM; 192 } 193 for (i = 0; i < nb; i++) { 194 memset(bip + i, 0, sizeof(BLOCK_INFO)); 195 bip[i].bi_inode = ino; 196 bip[i].bi_lbn = i; 197 bip[i].bi_version = dip->di_gen; 198 /* Don't set the size, but let lfs_bmap fill it in */ 199 } 200 lim.blkiov = bip; 201 lim.blkcnt = nb; 202 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) { 203 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m", 204 fs->lfs_fsmnt); 205 retval = COALESCE_BADBMAPV; 206 goto out; 207 } 208 #if 0 209 for (i = 0; i < nb; i++) { 210 printf("bi_size = %d, bi_ino = %d, " 211 "bi_lbn = %d, bi_daddr = %d\n", 212 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn, 213 bip[i].bi_daddr); 214 } 215 #endif 216 noff = toff = 0; 217 for (i = 1; i < nb; i++) { 218 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag) 219 ++noff; 220 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr 221 - fs->lfs_frag) >> fs->lfs_fbshift; 222 } 223 224 /* 225 * If this file is not discontinuous, there's no point in rewriting it. 226 * 227 * Explicitly allow a certain amount of discontinuity, since large 228 * files will be broken among segments and medium-sized files 229 * can have a break or two and it's okay. 230 */ 231 if (nb <= 1 || noff == 0 || noff < log2int(nb) || 232 lfs_segtod(fs, noff) * 2 < nb) { 233 retval = COALESCE_NOTWORTHIT; 234 goto out; 235 } else if (debug) 236 syslog(LOG_DEBUG, "ino %llu total discontinuity " 237 "%d (%lld) for %d blocks", (unsigned long long)ino, 238 noff, (long long)toff, nb); 239 240 /* Search for blocks in active segments; don't move them. */ 241 for (i = 0; i < nb; i++) { 242 if (bip[i].bi_daddr <= 0) 243 continue; 244 sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)]; 245 if (sup->flags & SEGUSE_ACTIVE) 246 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */ 247 } 248 249 /* 250 * Get rid of any blocks we've marked dead. If this is an older 251 * kernel that doesn't have bmapv fill in the block sizes, we'll 252 * toss everything here. 253 */ 254 onb = nb; 255 toss_old_blocks(fs, &bip, &nb, NULL); 256 nb = i; 257 258 /* 259 * We may have tossed enough blocks that it is no longer worthwhile 260 * to rewrite this inode. 261 */ 262 if (nb == 0 || onb - nb > log2int(onb)) { 263 if (debug) 264 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting"); 265 retval = COALESCE_NOTHINGLEFT; 266 goto out; 267 } 268 269 /* 270 * We are going to rewrite this inode. 271 * For any remaining blocks, read in their contents. 272 */ 273 for (i = 0; i < nb; i++) { 274 bip[i].bi_bp = malloc(bip[i].bi_size); 275 if (bip[i].bi_bp == NULL) { 276 syslog(LOG_WARNING, "allocate block buffer size=%d: %m", 277 bip[i].bi_size); 278 retval = COALESCE_NOMEM; 279 goto out; 280 } 281 282 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size, 283 lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) { 284 retval = COALESCE_EIO; 285 goto out; 286 } 287 } 288 if (debug) 289 syslog(LOG_DEBUG, "ino %llu markv %d blocks", 290 (unsigned long long)ino, nb); 291 292 /* 293 * Write in segment-sized chunks. If at any point we'd write more 294 * than half of the available segments, sleep until that's not 295 * true any more. 296 */ 297 bps = lfs_segtod(fs, 1); 298 for (tbip = bip; tbip < bip + nb; tbip += bps) { 299 do { 300 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp); 301 cip = *(CLEANERINFO *)bp->b_data; 302 brelse(bp, B_INVAL); 303 304 if (cip.clean < 4) /* XXX magic number 4 */ 305 kops.ko_fcntl(fs->clfs_ifilefd, 306 LFCNSEGWAIT, NULL); 307 } while(cip.clean < 4); 308 309 lim.blkiov = tbip; 310 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps); 311 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) { 312 retval = COALESCE_BADMARKV; 313 goto out; 314 } 315 } 316 317 retval = COALESCE_OK; 318 out: 319 free(dip); 320 if (bip) { 321 for (i = 0; i < onb; i++) 322 if (bip[i].bi_bp) 323 free(bip[i].bi_bp); 324 free(bip); 325 } 326 return retval; 327 } 328 329 /* 330 * Try coalescing every inode in the filesystem. 331 * Return the number of inodes actually altered. 332 */ 333 int clean_all_inodes(struct clfs *fs) 334 { 335 int i, r, maxino; 336 int totals[COALESCE_MAXERROR]; 337 struct stat st; 338 339 memset(totals, 0, sizeof(totals)); 340 341 fstat(fs->clfs_ifilefd, &st); 342 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) - 343 fs->lfs_segtabsz - fs->lfs_cleansz; 344 345 for (i = 0; i < maxino; i++) { 346 r = clean_inode(fs, i); 347 ++totals[r]; 348 } 349 350 for (i = 0; i < COALESCE_MAXERROR; i++) 351 if (totals[i]) 352 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i], 353 totals[i]); 354 355 return totals[COALESCE_OK]; 356 } 357 358 /* 359 * Fork a child process to coalesce this fs. 360 */ 361 int 362 fork_coalesce(struct clfs *fs) 363 { 364 static pid_t childpid; 365 int num; 366 367 /* 368 * If already running a coalescing child, don't start a new one. 369 */ 370 if (childpid) { 371 if (waitpid(childpid, NULL, WNOHANG) == childpid) 372 childpid = 0; 373 } 374 if (childpid && kill(childpid, 0) >= 0) { 375 /* already running a coalesce process */ 376 if (debug) 377 syslog(LOG_DEBUG, "coalescing already in progress"); 378 return 0; 379 } 380 381 /* 382 * Fork a child and let the child coalease 383 */ 384 childpid = fork(); 385 if (childpid < 0) { 386 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt); 387 return 0; 388 } else if (childpid == 0) { 389 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d", 390 fs->lfs_fsmnt, getpid()); 391 num = clean_all_inodes(fs); 392 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes", 393 fs->lfs_fsmnt, num); 394 exit(0); 395 } 396 397 return 0; 398 } 399