xref: /netbsd-src/libexec/lfs_cleanerd/coalesce.c (revision 274254cdae52594c1aa480a736aef78313d15c9c)
1 /*      $NetBSD: coalesce.c,v 1.17 2009/03/16 00:08:10 lukem Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39 
40 #include <ufs/ufs/dinode.h>
41 #include <ufs/lfs/lfs.h>
42 
43 #include <fcntl.h>
44 #include <signal.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <time.h>
49 #include <unistd.h>
50 #include <util.h>
51 #include <errno.h>
52 #include <err.h>
53 
54 #include <syslog.h>
55 
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 
60 extern int debug, do_mmap;
61 
62 int log2int(int n)
63 {
64 	int log;
65 
66 	log = 0;
67 	while (n > 0) {
68 		++log;
69 		n >>= 1;
70 	}
71 	return log - 1;
72 }
73 
74 enum coalesce_returncodes {
75 	COALESCE_OK = 0,
76 	COALESCE_NOINODE,
77 	COALESCE_TOOSMALL,
78 	COALESCE_BADSIZE,
79 	COALESCE_BADBLOCKSIZE,
80 	COALESCE_NOMEM,
81 	COALESCE_BADBMAPV,
82 	COALESCE_BADMARKV,
83 	COALESCE_NOTWORTHIT,
84 	COALESCE_NOTHINGLEFT,
85 	COALESCE_EIO,
86 
87 	COALESCE_MAXERROR
88 };
89 
90 const char *coalesce_return[] = {
91 	"Successfully coalesced",
92 	"File not in use or inode not found",
93 	"Not large enough to coalesce",
94 	"Negative size",
95 	"Not enough blocks to account for size",
96 	"Malloc failed",
97 	"LFCNBMAPV failed",
98 	"Not broken enough to fix",
99 	"Too many blocks not found",
100 	"Too many blocks found in active segments",
101 	"I/O error",
102 
103 	"No such error"
104 };
105 
106 static struct ufs1_dinode *
107 get_dinode(struct clfs *fs, ino_t ino)
108 {
109 	IFILE *ifp;
110 	daddr_t daddr;
111 	struct ubuf *bp;
112 	struct ufs1_dinode *dip, *r;
113 
114 	lfs_ientry(&ifp, fs, ino, &bp);
115 	daddr = ifp->if_daddr;
116 	brelse(bp, 0);
117 
118 	if (daddr == 0x0)
119 		return NULL;
120 
121 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
122 	for (dip = (struct ufs1_dinode *)bp->b_data;
123 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
124 		if (dip->di_inumber == ino) {
125 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
126 			memcpy(r, dip, sizeof(*r));
127 			brelse(bp, 0);
128 			return r;
129 		}
130 	brelse(bp, 0);
131 	return NULL;
132 }
133 
134 /*
135  * Find out if this inode's data blocks are discontinuous; if they are,
136  * rewrite them using markv.  Return the number of inodes rewritten.
137  */
138 static int
139 clean_inode(struct clfs *fs, ino_t ino)
140 {
141 	BLOCK_INFO *bip = NULL, *tbip;
142 	CLEANERINFO cip;
143 	struct ubuf *bp;
144 	struct ufs1_dinode *dip;
145 	struct clfs_seguse *sup;
146 	struct lfs_fcntl_markv /* {
147 		BLOCK_INFO *blkiov;
148 		int blkcnt;
149 	} */ lim;
150 	daddr_t toff;
151 	int i;
152 	int nb, onb, noff;
153 	int retval;
154 	int bps;
155 
156 	dip = get_dinode(fs, ino);
157 	if (dip == NULL)
158 		return COALESCE_NOINODE;
159 
160 	/* Compute file block size, set up for bmapv */
161 	onb = nb = lblkno(fs, dip->di_size);
162 
163 	/* XXX for now, don't do any file small enough to have fragments */
164 	if (nb < NDADDR) {
165 		free(dip);
166 		return COALESCE_TOOSMALL;
167 	}
168 
169 	/* Sanity checks */
170 #if 0	/* di_size is uint64_t -- this is a noop */
171 	if (dip->di_size < 0) {
172 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
173 		free(dip);
174 		return COALESCE_BADSIZE;
175 	}
176 #endif
177 	if (nb > dip->di_blocks) {
178 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
179 		     dip->di_blocks);
180 		free(dip);
181 		return COALESCE_BADBLOCKSIZE;
182 	}
183 
184 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
185 	if (bip == NULL) {
186 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
187 		    (unsigned long long)ino, nb);
188 		free(dip);
189 		return COALESCE_NOMEM;
190 	}
191 	for (i = 0; i < nb; i++) {
192 		memset(bip + i, 0, sizeof(BLOCK_INFO));
193 		bip[i].bi_inode = ino;
194 		bip[i].bi_lbn = i;
195 		bip[i].bi_version = dip->di_gen;
196 		/* Don't set the size, but let lfs_bmap fill it in */
197 	}
198 	lim.blkiov = bip;
199 	lim.blkcnt = nb;
200 	if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
201 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
202 		       fs->lfs_fsmnt);
203 		retval = COALESCE_BADBMAPV;
204 		goto out;
205 	}
206 #if 0
207 	for (i = 0; i < nb; i++) {
208 		printf("bi_size = %d, bi_ino = %d, "
209 		    "bi_lbn = %d, bi_daddr = %d\n",
210 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
211 		    bip[i].bi_daddr);
212 	}
213 #endif
214 	noff = toff = 0;
215 	for (i = 1; i < nb; i++) {
216 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
217 			++noff;
218 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
219 		    - fs->lfs_frag) >> fs->lfs_fbshift;
220 	}
221 
222 	/*
223 	 * If this file is not discontinuous, there's no point in rewriting it.
224 	 *
225 	 * Explicitly allow a certain amount of discontinuity, since large
226 	 * files will be broken among segments and medium-sized files
227 	 * can have a break or two and it's okay.
228 	 */
229 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
230 	    segtod(fs, noff) * 2 < nb) {
231 		retval = COALESCE_NOTWORTHIT;
232 		goto out;
233 	} else if (debug)
234 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
235 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
236 		    noff, (long long)toff, nb);
237 
238 	/* Search for blocks in active segments; don't move them. */
239 	for (i = 0; i < nb; i++) {
240 		if (bip[i].bi_daddr <= 0)
241 			continue;
242 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
243 		if (sup->flags & SEGUSE_ACTIVE)
244 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
245 	}
246 
247 	/*
248 	 * Get rid of any blocks we've marked dead.  If this is an older
249 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
250 	 * toss everything here.
251 	 */
252 	onb = nb;
253 	toss_old_blocks(fs, &bip, &nb, NULL);
254 	nb = i;
255 
256 	/*
257 	 * We may have tossed enough blocks that it is no longer worthwhile
258 	 * to rewrite this inode.
259 	 */
260 	if (nb == 0 || onb - nb > log2int(onb)) {
261 		if (debug)
262 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
263 		retval = COALESCE_NOTHINGLEFT;
264 		goto out;
265 	}
266 
267 	/*
268 	 * We are going to rewrite this inode.
269 	 * For any remaining blocks, read in their contents.
270 	 */
271 	for (i = 0; i < nb; i++) {
272 		bip[i].bi_bp = malloc(bip[i].bi_size);
273 		if (bip[i].bi_bp == NULL) {
274 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
275 			    bip[i].bi_size);
276 			retval = COALESCE_NOMEM;
277 			goto out;
278 		}
279 
280 		if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
281 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
282 			retval = COALESCE_EIO;
283 			goto out;
284 		}
285 	}
286 	if (debug)
287 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
288 		    (unsigned long long)ino, nb);
289 
290 	/*
291 	 * Write in segment-sized chunks.  If at any point we'd write more
292 	 * than half of the available segments, sleep until that's not
293 	 * true any more.
294 	 */
295 	bps = segtod(fs, 1);
296 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
297 		do {
298 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
299 			cip = *(CLEANERINFO *)bp->b_data;
300 			brelse(bp, B_INVAL);
301 
302 			if (cip.clean < 4) /* XXX magic number 4 */
303 				fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
304 		} while(cip.clean < 4);
305 
306 		lim.blkiov = tbip;
307 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
308 		if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
309 			retval = COALESCE_BADMARKV;
310 			goto out;
311 		}
312 	}
313 
314 	retval = COALESCE_OK;
315 out:
316 	free(dip);
317 	if (bip) {
318 		for (i = 0; i < onb; i++)
319 			if (bip[i].bi_bp)
320 				free(bip[i].bi_bp);
321 		free(bip);
322 	}
323 	return retval;
324 }
325 
326 /*
327  * Try coalescing every inode in the filesystem.
328  * Return the number of inodes actually altered.
329  */
330 int clean_all_inodes(struct clfs *fs)
331 {
332 	int i, r, maxino;
333 	int totals[COALESCE_MAXERROR];
334 	struct stat st;
335 
336 	memset(totals, 0, sizeof(totals));
337 
338 	fstat(fs->clfs_ifilefd, &st);
339 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
340 		fs->lfs_segtabsz - fs->lfs_cleansz;
341 
342 	for (i = 0; i < maxino; i++) {
343 		r = clean_inode(fs, i);
344 		++totals[r];
345 	}
346 
347 	for (i = 0; i < COALESCE_MAXERROR; i++)
348 		if (totals[i])
349 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
350 			       totals[i]);
351 
352 	return totals[COALESCE_OK];
353 }
354 
355 /*
356  * Fork a child process to coalesce this fs.
357  */
358 int
359 fork_coalesce(struct clfs *fs)
360 {
361 	static pid_t childpid;
362 	int num;
363 
364 	/*
365 	 * If already running a coalescing child, don't start a new one.
366 	 */
367 	if (childpid) {
368 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
369 			childpid = 0;
370 	}
371 	if (childpid && kill(childpid, 0) >= 0) {
372 		/* already running a coalesce process */
373 		if (debug)
374 			syslog(LOG_DEBUG, "coalescing already in progress");
375 		return 0;
376 	}
377 
378 	/*
379 	 * Fork a child and let the child coalease
380 	 */
381 	childpid = fork();
382 	if (childpid < 0) {
383 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
384 		return 0;
385 	} else if (childpid == 0) {
386 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
387 		       fs->lfs_fsmnt, getpid());
388 		num = clean_all_inodes(fs);
389 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
390 		       fs->lfs_fsmnt, num);
391 		exit(0);
392 	}
393 
394 	return 0;
395 }
396