xref: /netbsd-src/libexec/lfs_cleanerd/coalesce.c (revision 7fa608457b817eca6e0977b37f758ae064f3c99c)
1 /*      $NetBSD: coalesce.c,v 1.14 2007/10/08 21:41:12 ad Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *      This product includes software developed by the NetBSD
21  *      Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 #include <sys/param.h>
40 #include <sys/mount.h>
41 #include <sys/time.h>
42 #include <sys/resource.h>
43 #include <sys/types.h>
44 #include <sys/wait.h>
45 #include <sys/mman.h>
46 
47 #include <ufs/ufs/dinode.h>
48 #include <ufs/lfs/lfs.h>
49 
50 #include <fcntl.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <time.h>
56 #include <unistd.h>
57 #include <util.h>
58 #include <errno.h>
59 #include <err.h>
60 
61 #include <syslog.h>
62 
63 #include "bufcache.h"
64 #include "vnode.h"
65 #include "cleaner.h"
66 
67 extern int debug, do_mmap;
68 
/*
 * Integer base-2 logarithm, rounded down: the zero-based position of the
 * highest set bit of n.  Returns -1 when n is zero or negative.
 */
int
log2int(int n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
80 
/*
 * Outcome of an attempt to coalesce one inode.  The values are used as
 * indices into coalesce_return[] and into per-outcome counters, so they
 * must stay dense and start at zero.  COALESCE_MAXERROR is the number of
 * real outcomes, not an outcome itself.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each coalesce_returncodes value.
 *
 * Each string is bound to its code with a designated initializer so the
 * table cannot drift out of step with the enum: previously the entries
 * were positional, there was no LFCNMARKV message, and the codes from
 * COALESCE_BADMARKV onward were reported under the wrong text.
 */
char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
112 
113 static struct ufs1_dinode *
114 get_dinode(struct clfs *fs, ino_t ino)
115 {
116 	IFILE *ifp;
117 	daddr_t daddr;
118 	struct ubuf *bp;
119 	struct ufs1_dinode *dip, *r;
120 
121 	lfs_ientry(&ifp, fs, ino, &bp);
122 	daddr = ifp->if_daddr;
123 	brelse(bp, 0);
124 
125 	if (daddr == 0x0)
126 		return NULL;
127 
128 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp);
129 	for (dip = (struct ufs1_dinode *)bp->b_data;
130 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
131 		if (dip->di_inumber == ino) {
132 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
133 			memcpy(r, dip, sizeof(*r));
134 			brelse(bp, 0);
135 			return r;
136 		}
137 	brelse(bp, 0);
138 	return NULL;
139 }
140 
141 /*
142  * Find out if this inode's data blocks are discontinuous; if they are,
143  * rewrite them using markv.  Return the number of inodes rewritten.
144  */
145 static int
146 clean_inode(struct clfs *fs, ino_t ino)
147 {
148 	BLOCK_INFO *bip = NULL, *tbip;
149 	CLEANERINFO cip;
150 	struct ubuf *bp;
151 	struct ufs1_dinode *dip;
152 	struct clfs_seguse *sup;
153 	struct lfs_fcntl_markv /* {
154 		BLOCK_INFO *blkiov;
155 		int blkcnt;
156 	} */ lim;
157 	daddr_t toff;
158 	int i;
159 	int nb, onb, noff;
160 	int retval;
161 	int bps;
162 
163 	dip = get_dinode(fs, ino);
164 	if (dip == NULL)
165 		return COALESCE_NOINODE;
166 
167 	/* Compute file block size, set up for bmapv */
168 	onb = nb = lblkno(fs, dip->di_size);
169 
170 	/* XXX for now, don't do any file small enough to have fragments */
171 	if (nb < NDADDR) {
172 		free(dip);
173 		return COALESCE_TOOSMALL;
174 	}
175 
176 	/* Sanity checks */
177 	if (dip->di_size < 0) {
178 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
179 		free(dip);
180 		return COALESCE_BADSIZE;
181 	}
182 	if (nb > dip->di_blocks) {
183 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
184 		     dip->di_blocks);
185 		free(dip);
186 		return COALESCE_BADBLOCKSIZE;
187 	}
188 
189 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
190 	if (bip == NULL) {
191 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
192 		    (unsigned long long)ino, nb);
193 		free(dip);
194 		return COALESCE_NOMEM;
195 	}
196 	for (i = 0; i < nb; i++) {
197 		memset(bip + i, 0, sizeof(BLOCK_INFO));
198 		bip[i].bi_inode = ino;
199 		bip[i].bi_lbn = i;
200 		bip[i].bi_version = dip->di_gen;
201 		/* Don't set the size, but let lfs_bmap fill it in */
202 	}
203 	lim.blkiov = bip;
204 	lim.blkcnt = nb;
205 	if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
206 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
207 		       fs->lfs_fsmnt);
208 		retval = COALESCE_BADBMAPV;
209 		goto out;
210 	}
211 #if 0
212 	for (i = 0; i < nb; i++) {
213 		printf("bi_size = %d, bi_ino = %d, "
214 		    "bi_lbn = %d, bi_daddr = %d\n",
215 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
216 		    bip[i].bi_daddr);
217 	}
218 #endif
219 	noff = toff = 0;
220 	for (i = 1; i < nb; i++) {
221 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
222 			++noff;
223 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
224 		    - fs->lfs_frag) >> fs->lfs_fbshift;
225 	}
226 
227 	/*
228 	 * If this file is not discontinuous, there's no point in rewriting it.
229 	 *
230 	 * Explicitly allow a certain amount of discontinuity, since large
231 	 * files will be broken among segments and medium-sized files
232 	 * can have a break or two and it's okay.
233 	 */
234 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
235 	    segtod(fs, noff) * 2 < nb) {
236 		retval = COALESCE_NOTWORTHIT;
237 		goto out;
238 	} else if (debug)
239 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
240 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
241 		    noff, (long long)toff, nb);
242 
243 	/* Search for blocks in active segments; don't move them. */
244 	for (i = 0; i < nb; i++) {
245 		if (bip[i].bi_daddr <= 0)
246 			continue;
247 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
248 		if (sup->flags & SEGUSE_ACTIVE)
249 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
250 	}
251 
252 	/*
253 	 * Get rid of any blocks we've marked dead.  If this is an older
254 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
255 	 * toss everything here.
256 	 */
257 	onb = nb;
258 	toss_old_blocks(fs, &bip, &nb, NULL);
259 	nb = i;
260 
261 	/*
262 	 * We may have tossed enough blocks that it is no longer worthwhile
263 	 * to rewrite this inode.
264 	 */
265 	if (nb == 0 || onb - nb > log2int(onb)) {
266 		if (debug)
267 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
268 		retval = COALESCE_NOTHINGLEFT;
269 		goto out;
270 	}
271 
272 	/*
273 	 * We are going to rewrite this inode.
274 	 * For any remaining blocks, read in their contents.
275 	 */
276 	for (i = 0; i < nb; i++) {
277 		bip[i].bi_bp = malloc(bip[i].bi_size);
278 		if (bip[i].bi_bp == NULL) {
279 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
280 			    bip[i].bi_size);
281 			retval = COALESCE_NOMEM;
282 			goto out;
283 		}
284 
285 		if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
286 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
287 			retval = COALESCE_EIO;
288 			goto out;
289 		}
290 	}
291 	if (debug)
292 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
293 		    (unsigned long long)ino, nb);
294 
295 	/*
296 	 * Write in segment-sized chunks.  If at any point we'd write more
297 	 * than half of the available segments, sleep until that's not
298 	 * true any more.
299 	 */
300 	bps = segtod(fs, 1);
301 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
302 		do {
303 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp);
304 			cip = *(CLEANERINFO *)bp->b_data;
305 			brelse(bp, B_INVAL);
306 
307 			if (cip.clean < 4) /* XXX magic number 4 */
308 				fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
309 		} while(cip.clean < 4);
310 
311 		lim.blkiov = tbip;
312 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
313 		if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
314 			retval = COALESCE_BADMARKV;
315 			goto out;
316 		}
317 	}
318 
319 	retval = COALESCE_OK;
320 out:
321 	free(dip);
322 	if (bip) {
323 		for (i = 0; i < onb; i++)
324 			if (bip[i].bi_bp)
325 				free(bip[i].bi_bp);
326 		free(bip);
327 	}
328 	return retval;
329 }
330 
331 /*
332  * Try coalescing every inode in the filesystem.
333  * Return the number of inodes actually altered.
334  */
335 int clean_all_inodes(struct clfs *fs)
336 {
337 	int i, r, maxino;
338 	int totals[COALESCE_MAXERROR];
339 	struct stat st;
340 
341 	memset(totals, 0, sizeof(totals));
342 
343 	fstat(fs->clfs_ifilefd, &st);
344 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
345 		fs->lfs_segtabsz - fs->lfs_cleansz;
346 
347 	for (i = 0; i < maxino; i++) {
348 		r = clean_inode(fs, i);
349 		++totals[r];
350 	}
351 
352 	for (i = 0; i < COALESCE_MAXERROR; i++)
353 		if (totals[i])
354 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
355 			       totals[i]);
356 
357 	return totals[COALESCE_OK];
358 }
359 
360 /*
361  * Fork a child process to coalesce this fs.
362  */
363 int
364 fork_coalesce(struct clfs *fs)
365 {
366 	static pid_t childpid;
367 	int num;
368 
369 	/*
370 	 * If already running a coalescing child, don't start a new one.
371 	 */
372 	if (childpid) {
373 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
374 			childpid = 0;
375 	}
376 	if (childpid && kill(childpid, 0) >= 0) {
377 		/* already running a coalesce process */
378 		if (debug)
379 			syslog(LOG_DEBUG, "coalescing already in progress");
380 		return 0;
381 	}
382 
383 	/*
384 	 * Fork a child and let the child coalease
385 	 */
386 	childpid = fork();
387 	if (childpid < 0) {
388 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
389 		return 0;
390 	} else if (childpid == 0) {
391 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
392 		       fs->lfs_fsmnt, getpid());
393 		num = clean_all_inodes(fs);
394 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
395 		       fs->lfs_fsmnt, num);
396 		exit(0);
397 	}
398 
399 	return 0;
400 }
401