xref: /netbsd-src/libexec/lfs_cleanerd/coalesce.c (revision daf6c4152fcddc27c445489775ed1f66ab4ea9a9)
1 /*      $NetBSD: coalesce.c,v 1.18 2009/08/06 00:51:55 pooka Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39 
40 #include <ufs/ufs/dinode.h>
41 #include <ufs/lfs/lfs.h>
42 
43 #include <fcntl.h>
44 #include <signal.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <time.h>
49 #include <unistd.h>
50 #include <util.h>
51 #include <errno.h>
52 #include <err.h>
53 
54 #include <syslog.h>
55 
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 #include "kernelops.h"
60 
61 extern int debug, do_mmap;
62 
/*
 * Integer base-2 logarithm, rounded down.
 * Yields -1 when n is zero or negative.
 */
int log2int(int n)
{
	int bits;

	/* Start one below zero so that the first set bit counts as 2^0. */
	for (bits = -1; n > 0; n >>= 1)
		bits++;

	return bits;
}
74 
/*
 * Result codes produced by clean_inode().  COALESCE_MAXERROR is not a
 * real result: it counts the codes above it and indexes the fallback
 * message in coalesce_return[].
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable text for each result code, indexed by
 * enum coalesce_returncodes.
 *
 * Each entry is bound to its code with a designated initializer so the
 * table cannot silently drift out of step with the enum (the positional
 * version had no entry for COALESCE_BADMARKV, which shifted the
 * messages of the codes that follow it).
 */
const char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
106 
107 static struct ufs1_dinode *
108 get_dinode(struct clfs *fs, ino_t ino)
109 {
110 	IFILE *ifp;
111 	daddr_t daddr;
112 	struct ubuf *bp;
113 	struct ufs1_dinode *dip, *r;
114 
115 	lfs_ientry(&ifp, fs, ino, &bp);
116 	daddr = ifp->if_daddr;
117 	brelse(bp, 0);
118 
119 	if (daddr == 0x0)
120 		return NULL;
121 
122 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
123 	for (dip = (struct ufs1_dinode *)bp->b_data;
124 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
125 		if (dip->di_inumber == ino) {
126 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
127 			memcpy(r, dip, sizeof(*r));
128 			brelse(bp, 0);
129 			return r;
130 		}
131 	brelse(bp, 0);
132 	return NULL;
133 }
134 
135 /*
136  * Find out if this inode's data blocks are discontinuous; if they are,
137  * rewrite them using markv.  Return the number of inodes rewritten.
138  */
139 static int
140 clean_inode(struct clfs *fs, ino_t ino)
141 {
142 	BLOCK_INFO *bip = NULL, *tbip;
143 	CLEANERINFO cip;
144 	struct ubuf *bp;
145 	struct ufs1_dinode *dip;
146 	struct clfs_seguse *sup;
147 	struct lfs_fcntl_markv /* {
148 		BLOCK_INFO *blkiov;
149 		int blkcnt;
150 	} */ lim;
151 	daddr_t toff;
152 	int i;
153 	int nb, onb, noff;
154 	int retval;
155 	int bps;
156 
157 	dip = get_dinode(fs, ino);
158 	if (dip == NULL)
159 		return COALESCE_NOINODE;
160 
161 	/* Compute file block size, set up for bmapv */
162 	onb = nb = lblkno(fs, dip->di_size);
163 
164 	/* XXX for now, don't do any file small enough to have fragments */
165 	if (nb < NDADDR) {
166 		free(dip);
167 		return COALESCE_TOOSMALL;
168 	}
169 
170 	/* Sanity checks */
171 #if 0	/* di_size is uint64_t -- this is a noop */
172 	if (dip->di_size < 0) {
173 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
174 		free(dip);
175 		return COALESCE_BADSIZE;
176 	}
177 #endif
178 	if (nb > dip->di_blocks) {
179 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
180 		     dip->di_blocks);
181 		free(dip);
182 		return COALESCE_BADBLOCKSIZE;
183 	}
184 
185 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
186 	if (bip == NULL) {
187 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
188 		    (unsigned long long)ino, nb);
189 		free(dip);
190 		return COALESCE_NOMEM;
191 	}
192 	for (i = 0; i < nb; i++) {
193 		memset(bip + i, 0, sizeof(BLOCK_INFO));
194 		bip[i].bi_inode = ino;
195 		bip[i].bi_lbn = i;
196 		bip[i].bi_version = dip->di_gen;
197 		/* Don't set the size, but let lfs_bmap fill it in */
198 	}
199 	lim.blkiov = bip;
200 	lim.blkcnt = nb;
201 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
202 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
203 		       fs->lfs_fsmnt);
204 		retval = COALESCE_BADBMAPV;
205 		goto out;
206 	}
207 #if 0
208 	for (i = 0; i < nb; i++) {
209 		printf("bi_size = %d, bi_ino = %d, "
210 		    "bi_lbn = %d, bi_daddr = %d\n",
211 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
212 		    bip[i].bi_daddr);
213 	}
214 #endif
215 	noff = toff = 0;
216 	for (i = 1; i < nb; i++) {
217 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
218 			++noff;
219 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
220 		    - fs->lfs_frag) >> fs->lfs_fbshift;
221 	}
222 
223 	/*
224 	 * If this file is not discontinuous, there's no point in rewriting it.
225 	 *
226 	 * Explicitly allow a certain amount of discontinuity, since large
227 	 * files will be broken among segments and medium-sized files
228 	 * can have a break or two and it's okay.
229 	 */
230 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
231 	    segtod(fs, noff) * 2 < nb) {
232 		retval = COALESCE_NOTWORTHIT;
233 		goto out;
234 	} else if (debug)
235 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
236 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
237 		    noff, (long long)toff, nb);
238 
239 	/* Search for blocks in active segments; don't move them. */
240 	for (i = 0; i < nb; i++) {
241 		if (bip[i].bi_daddr <= 0)
242 			continue;
243 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
244 		if (sup->flags & SEGUSE_ACTIVE)
245 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
246 	}
247 
248 	/*
249 	 * Get rid of any blocks we've marked dead.  If this is an older
250 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
251 	 * toss everything here.
252 	 */
253 	onb = nb;
254 	toss_old_blocks(fs, &bip, &nb, NULL);
255 	nb = i;
256 
257 	/*
258 	 * We may have tossed enough blocks that it is no longer worthwhile
259 	 * to rewrite this inode.
260 	 */
261 	if (nb == 0 || onb - nb > log2int(onb)) {
262 		if (debug)
263 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
264 		retval = COALESCE_NOTHINGLEFT;
265 		goto out;
266 	}
267 
268 	/*
269 	 * We are going to rewrite this inode.
270 	 * For any remaining blocks, read in their contents.
271 	 */
272 	for (i = 0; i < nb; i++) {
273 		bip[i].bi_bp = malloc(bip[i].bi_size);
274 		if (bip[i].bi_bp == NULL) {
275 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
276 			    bip[i].bi_size);
277 			retval = COALESCE_NOMEM;
278 			goto out;
279 		}
280 
281 		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
282 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
283 			retval = COALESCE_EIO;
284 			goto out;
285 		}
286 	}
287 	if (debug)
288 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
289 		    (unsigned long long)ino, nb);
290 
291 	/*
292 	 * Write in segment-sized chunks.  If at any point we'd write more
293 	 * than half of the available segments, sleep until that's not
294 	 * true any more.
295 	 */
296 	bps = segtod(fs, 1);
297 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
298 		do {
299 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
300 			cip = *(CLEANERINFO *)bp->b_data;
301 			brelse(bp, B_INVAL);
302 
303 			if (cip.clean < 4) /* XXX magic number 4 */
304 				kops.ko_fcntl(fs->clfs_ifilefd,
305 				    LFCNSEGWAIT, NULL);
306 		} while(cip.clean < 4);
307 
308 		lim.blkiov = tbip;
309 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
310 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
311 			retval = COALESCE_BADMARKV;
312 			goto out;
313 		}
314 	}
315 
316 	retval = COALESCE_OK;
317 out:
318 	free(dip);
319 	if (bip) {
320 		for (i = 0; i < onb; i++)
321 			if (bip[i].bi_bp)
322 				free(bip[i].bi_bp);
323 		free(bip);
324 	}
325 	return retval;
326 }
327 
328 /*
329  * Try coalescing every inode in the filesystem.
330  * Return the number of inodes actually altered.
331  */
332 int clean_all_inodes(struct clfs *fs)
333 {
334 	int i, r, maxino;
335 	int totals[COALESCE_MAXERROR];
336 	struct stat st;
337 
338 	memset(totals, 0, sizeof(totals));
339 
340 	fstat(fs->clfs_ifilefd, &st);
341 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
342 		fs->lfs_segtabsz - fs->lfs_cleansz;
343 
344 	for (i = 0; i < maxino; i++) {
345 		r = clean_inode(fs, i);
346 		++totals[r];
347 	}
348 
349 	for (i = 0; i < COALESCE_MAXERROR; i++)
350 		if (totals[i])
351 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
352 			       totals[i]);
353 
354 	return totals[COALESCE_OK];
355 }
356 
357 /*
358  * Fork a child process to coalesce this fs.
359  */
360 int
361 fork_coalesce(struct clfs *fs)
362 {
363 	static pid_t childpid;
364 	int num;
365 
366 	/*
367 	 * If already running a coalescing child, don't start a new one.
368 	 */
369 	if (childpid) {
370 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
371 			childpid = 0;
372 	}
373 	if (childpid && kill(childpid, 0) >= 0) {
374 		/* already running a coalesce process */
375 		if (debug)
376 			syslog(LOG_DEBUG, "coalescing already in progress");
377 		return 0;
378 	}
379 
380 	/*
381 	 * Fork a child and let the child coalease
382 	 */
383 	childpid = fork();
384 	if (childpid < 0) {
385 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
386 		return 0;
387 	} else if (childpid == 0) {
388 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
389 		       fs->lfs_fsmnt, getpid());
390 		num = clean_all_inodes(fs);
391 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
392 		       fs->lfs_fsmnt, num);
393 		exit(0);
394 	}
395 
396 	return 0;
397 }
398