xref: /netbsd-src/libexec/lfs_cleanerd/coalesce.c (revision fad4c9f71477ae11cea2ee75ec82151ac770a534)
1 /*      $NetBSD: coalesce.c,v 1.13 2006/05/12 19:35:27 perseant Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *      This product includes software developed by the NetBSD
21  *      Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 #include <sys/param.h>
40 #include <sys/mount.h>
41 #include <sys/time.h>
42 #include <sys/resource.h>
43 #include <sys/types.h>
44 #include <sys/wait.h>
45 #include <sys/mman.h>
46 
47 #include <ufs/ufs/dinode.h>
48 #include <ufs/lfs/lfs.h>
49 
50 #include <fcntl.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <time.h>
56 #include <unistd.h>
57 #include <util.h>
58 #include <errno.h>
59 #include <err.h>
60 
61 #include <syslog.h>
62 
63 #include "bufcache.h"
64 #include "vnode.h"
65 #include "cleaner.h"
66 
67 extern int debug, do_mmap;
68 
/*
 * Integer base-2 logarithm, rounded down: the index of the highest set
 * bit of n.  Returns -1 when n is zero or negative.
 */
int log2int(int n)
{
	int bits;

	/* Each right shift that still had a positive n accounts for one bit. */
	for (bits = -1; n > 0; n >>= 1)
		bits++;

	return bits;
}
80 
/*
 * Result codes produced when attempting to coalesce a single inode.
 * COALESCE_MAXERROR is not a real result; it is one past the last valid
 * code and is used to size per-code tables.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code, indexed by the code.
 *
 * Designated initializers tie every string to its enumerator, so the
 * table cannot drift out of step with the enum when codes are added or
 * reordered.
 */
char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks not found or in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
112 
113 static struct ufs1_dinode *
114 get_dinode(struct clfs *fs, ino_t ino)
115 {
116 	IFILE *ifp;
117 	daddr_t daddr;
118 	struct ubuf *bp;
119 	struct ufs1_dinode *dip, *r;
120 
121 	lfs_ientry(&ifp, fs, ino, &bp);
122 	daddr = ifp->if_daddr;
123 	brelse(bp);
124 
125 	if (daddr == 0x0)
126 		return NULL;
127 
128 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp);
129 	for (dip = (struct ufs1_dinode *)bp->b_data;
130 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
131 		if (dip->di_inumber == ino) {
132 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
133 			memcpy(r, dip, sizeof(*r));
134 			brelse(bp);
135 			return r;
136 		}
137 	brelse(bp);
138 	return NULL;
139 }
140 
141 /*
142  * Find out if this inode's data blocks are discontinuous; if they are,
143  * rewrite them using markv.  Return the number of inodes rewritten.
144  */
145 static int
146 clean_inode(struct clfs *fs, ino_t ino)
147 {
148 	BLOCK_INFO *bip = NULL, *tbip;
149 	CLEANERINFO cip;
150 	struct ubuf *bp;
151 	struct ufs1_dinode *dip;
152 	struct clfs_seguse *sup;
153 	struct lfs_fcntl_markv /* {
154 		BLOCK_INFO *blkiov;
155 		int blkcnt;
156 	} */ lim;
157 	daddr_t toff;
158 	int i;
159 	int nb, onb, noff;
160 	int retval;
161 	int bps;
162 
163 	dip = get_dinode(fs, ino);
164 	if (dip == NULL)
165 		return COALESCE_NOINODE;
166 
167 	/* Compute file block size, set up for bmapv */
168 	onb = nb = lblkno(fs, dip->di_size);
169 
170 	/* XXX for now, don't do any file small enough to have fragments */
171 	if (nb < NDADDR) {
172 		free(dip);
173 		return COALESCE_TOOSMALL;
174 	}
175 
176 	/* Sanity checks */
177 	if (dip->di_size < 0) {
178 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
179 		free(dip);
180 		return COALESCE_BADSIZE;
181 	}
182 	if (nb > dip->di_blocks) {
183 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
184 		     dip->di_blocks);
185 		free(dip);
186 		return COALESCE_BADBLOCKSIZE;
187 	}
188 
189 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
190 	if (bip == NULL) {
191 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
192 		    (unsigned long long)ino, nb);
193 		free(dip);
194 		return COALESCE_NOMEM;
195 	}
196 	for (i = 0; i < nb; i++) {
197 		memset(bip + i, 0, sizeof(BLOCK_INFO));
198 		bip[i].bi_inode = ino;
199 		bip[i].bi_lbn = i;
200 		bip[i].bi_version = dip->di_gen;
201 		/* Don't set the size, but let lfs_bmap fill it in */
202 	}
203 	lim.blkiov = bip;
204 	lim.blkcnt = nb;
205 	if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
206 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
207 		       fs->lfs_fsmnt);
208 		retval = COALESCE_BADBMAPV;
209 		goto out;
210 	}
211 #if 0
212 	for (i = 0; i < nb; i++) {
213 		printf("bi_size = %d, bi_ino = %d, "
214 		    "bi_lbn = %d, bi_daddr = %d\n",
215 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
216 		    bip[i].bi_daddr);
217 	}
218 #endif
219 	noff = toff = 0;
220 	for (i = 1; i < nb; i++) {
221 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
222 			++noff;
223 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
224 		    - fs->lfs_frag) >> fs->lfs_fbshift;
225 	}
226 
227 	/*
228 	 * If this file is not discontinuous, there's no point in rewriting it.
229 	 *
230 	 * Explicitly allow a certain amount of discontinuity, since large
231 	 * files will be broken among segments and medium-sized files
232 	 * can have a break or two and it's okay.
233 	 */
234 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
235 	    segtod(fs, noff) * 2 < nb) {
236 		retval = COALESCE_NOTWORTHIT;
237 		goto out;
238 	} else if (debug)
239 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
240 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
241 		    noff, (long long)toff, nb);
242 
243 	/* Search for blocks in active segments; don't move them. */
244 	for (i = 0; i < nb; i++) {
245 		if (bip[i].bi_daddr <= 0)
246 			continue;
247 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
248 		if (sup->flags & SEGUSE_ACTIVE)
249 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
250 	}
251 
252 	/*
253 	 * Get rid of any blocks we've marked dead.  If this is an older
254 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
255 	 * toss everything here.
256 	 */
257 	onb = nb;
258 	toss_old_blocks(fs, &bip, &nb, NULL);
259 	nb = i;
260 
261 	/*
262 	 * We may have tossed enough blocks that it is no longer worthwhile
263 	 * to rewrite this inode.
264 	 */
265 	if (nb == 0 || onb - nb > log2int(onb)) {
266 		if (debug)
267 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
268 		retval = COALESCE_NOTHINGLEFT;
269 		goto out;
270 	}
271 
272 	/*
273 	 * We are going to rewrite this inode.
274 	 * For any remaining blocks, read in their contents.
275 	 */
276 	for (i = 0; i < nb; i++) {
277 		bip[i].bi_bp = malloc(bip[i].bi_size);
278 		if (bip[i].bi_bp == NULL) {
279 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
280 			    bip[i].bi_size);
281 			retval = COALESCE_NOMEM;
282 			goto out;
283 		}
284 
285 		if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
286 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
287 			retval = COALESCE_EIO;
288 			goto out;
289 		}
290 	}
291 	if (debug)
292 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
293 		    (unsigned long long)ino, nb);
294 
295 	/*
296 	 * Write in segment-sized chunks.  If at any point we'd write more
297 	 * than half of the available segments, sleep until that's not
298 	 * true any more.
299 	 */
300 	bps = segtod(fs, 1);
301 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
302 		do {
303 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp);
304 			cip = *(CLEANERINFO *)bp->b_data;
305 			bp->b_flags |= B_INVAL;
306 			brelse(bp);
307 
308 			if (cip.clean < 4) /* XXX magic number 4 */
309 				fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
310 		} while(cip.clean < 4);
311 
312 		lim.blkiov = tbip;
313 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
314 		if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
315 			retval = COALESCE_BADMARKV;
316 			goto out;
317 		}
318 	}
319 
320 	retval = COALESCE_OK;
321 out:
322 	free(dip);
323 	if (bip) {
324 		for (i = 0; i < onb; i++)
325 			if (bip[i].bi_bp)
326 				free(bip[i].bi_bp);
327 		free(bip);
328 	}
329 	return retval;
330 }
331 
332 /*
333  * Try coalescing every inode in the filesystem.
334  * Return the number of inodes actually altered.
335  */
336 int clean_all_inodes(struct clfs *fs)
337 {
338 	int i, r, maxino;
339 	int totals[COALESCE_MAXERROR];
340 	struct stat st;
341 
342 	memset(totals, 0, sizeof(totals));
343 
344 	fstat(fs->clfs_ifilefd, &st);
345 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
346 		fs->lfs_segtabsz - fs->lfs_cleansz;
347 
348 	for (i = 0; i < maxino; i++) {
349 		r = clean_inode(fs, i);
350 		++totals[r];
351 	}
352 
353 	for (i = 0; i < COALESCE_MAXERROR; i++)
354 		if (totals[i])
355 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
356 			       totals[i]);
357 
358 	return totals[COALESCE_OK];
359 }
360 
361 /*
362  * Fork a child process to coalesce this fs.
363  */
364 int
365 fork_coalesce(struct clfs *fs)
366 {
367 	static pid_t childpid;
368 	int num;
369 
370 	/*
371 	 * If already running a coalescing child, don't start a new one.
372 	 */
373 	if (childpid) {
374 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
375 			childpid = 0;
376 	}
377 	if (childpid && kill(childpid, 0) >= 0) {
378 		/* already running a coalesce process */
379 		if (debug)
380 			syslog(LOG_DEBUG, "coalescing already in progress");
381 		return 0;
382 	}
383 
384 	/*
385 	 * Fork a child and let the child coalease
386 	 */
387 	childpid = fork();
388 	if (childpid < 0) {
389 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
390 		return 0;
391 	} else if (childpid == 0) {
392 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
393 		       fs->lfs_fsmnt, getpid());
394 		num = clean_all_inodes(fs);
395 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
396 		       fs->lfs_fsmnt, num);
397 		exit(0);
398 	}
399 
400 	return 0;
401 }
402