xref: /netbsd-src/libexec/lfs_cleanerd/coalesce.c (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1 /*      $NetBSD: coalesce.c,v 1.23 2013/06/18 18:18:57 christos Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39 
40 #include <ufs/lfs/lfs.h>
41 
42 #include <fcntl.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <util.h>
50 #include <errno.h>
51 #include <err.h>
52 
53 #include <syslog.h>
54 
55 #include "bufcache.h"
56 #include "vnode.h"
57 #include "cleaner.h"
58 #include "kernelops.h"
59 
60 extern int debug, do_mmap;
61 
/*
 * Integer base-2 logarithm: the index of the highest set bit of a
 * positive n (i.e. floor(log2(n))).  Any n <= 0 yields -1.
 */
int
log2int(int n)
{
	int highbit;

	/* Start one below zero so that n == 1 comes out as 0. */
	for (highbit = -1; n > 0; n >>= 1)
		highbit++;

	return highbit;
}
73 
/*
 * Result codes produced by clean_inode().  COALESCE_MAXERROR is not a
 * real result; it is the number of codes and sizes the per-code tally
 * kept by clean_all_inodes().
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code, indexed by the code
 * itself.  Designated initializers tie every string to its enumerator so
 * the table cannot drift out of step with the enum when codes are added
 * or reordered.
 */
const char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
105 
106 static struct ulfs1_dinode *
107 get_dinode(struct clfs *fs, ino_t ino)
108 {
109 	IFILE *ifp;
110 	daddr_t daddr;
111 	struct ubuf *bp;
112 	struct ulfs1_dinode *dip, *r;
113 
114 	lfs_ientry(&ifp, fs, ino, &bp);
115 	daddr = ifp->if_daddr;
116 	brelse(bp, 0);
117 
118 	if (daddr == 0x0)
119 		return NULL;
120 
121 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
122 	for (dip = (struct ulfs1_dinode *)bp->b_data;
123 	     dip < (struct ulfs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
124 		if (dip->di_inumber == ino) {
125 			r = (struct ulfs1_dinode *)malloc(sizeof(*r));
126 			if (r == NULL)
127 				break;
128 			memcpy(r, dip, sizeof(*r));
129 			brelse(bp, 0);
130 			return r;
131 		}
132 	brelse(bp, 0);
133 	return NULL;
134 }
135 
136 /*
137  * Find out if this inode's data blocks are discontinuous; if they are,
138  * rewrite them using markv.  Return the number of inodes rewritten.
139  */
140 static int
141 clean_inode(struct clfs *fs, ino_t ino)
142 {
143 	BLOCK_INFO *bip = NULL, *tbip;
144 	CLEANERINFO cip;
145 	struct ubuf *bp;
146 	struct ulfs1_dinode *dip;
147 	struct clfs_seguse *sup;
148 	struct lfs_fcntl_markv /* {
149 		BLOCK_INFO *blkiov;
150 		int blkcnt;
151 	} */ lim;
152 	daddr_t toff;
153 	int i;
154 	int nb, onb, noff;
155 	int retval;
156 	int bps;
157 
158 	dip = get_dinode(fs, ino);
159 	if (dip == NULL)
160 		return COALESCE_NOINODE;
161 
162 	/* Compute file block size, set up for bmapv */
163 	onb = nb = lfs_lblkno(fs, dip->di_size);
164 
165 	/* XXX for now, don't do any file small enough to have fragments */
166 	if (nb < ULFS_NDADDR) {
167 		free(dip);
168 		return COALESCE_TOOSMALL;
169 	}
170 
171 	/* Sanity checks */
172 #if 0	/* di_size is uint64_t -- this is a noop */
173 	if (dip->di_size < 0) {
174 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
175 		free(dip);
176 		return COALESCE_BADSIZE;
177 	}
178 #endif
179 	if (nb > dip->di_blocks) {
180 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
181 		     dip->di_blocks);
182 		free(dip);
183 		return COALESCE_BADBLOCKSIZE;
184 	}
185 
186 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
187 	if (bip == NULL) {
188 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
189 		    (unsigned long long)ino, nb);
190 		free(dip);
191 		return COALESCE_NOMEM;
192 	}
193 	for (i = 0; i < nb; i++) {
194 		memset(bip + i, 0, sizeof(BLOCK_INFO));
195 		bip[i].bi_inode = ino;
196 		bip[i].bi_lbn = i;
197 		bip[i].bi_version = dip->di_gen;
198 		/* Don't set the size, but let lfs_bmap fill it in */
199 	}
200 	lim.blkiov = bip;
201 	lim.blkcnt = nb;
202 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
203 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
204 		       fs->lfs_fsmnt);
205 		retval = COALESCE_BADBMAPV;
206 		goto out;
207 	}
208 #if 0
209 	for (i = 0; i < nb; i++) {
210 		printf("bi_size = %d, bi_ino = %d, "
211 		    "bi_lbn = %d, bi_daddr = %d\n",
212 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
213 		    bip[i].bi_daddr);
214 	}
215 #endif
216 	noff = toff = 0;
217 	for (i = 1; i < nb; i++) {
218 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
219 			++noff;
220 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
221 		    - fs->lfs_frag) >> fs->lfs_fbshift;
222 	}
223 
224 	/*
225 	 * If this file is not discontinuous, there's no point in rewriting it.
226 	 *
227 	 * Explicitly allow a certain amount of discontinuity, since large
228 	 * files will be broken among segments and medium-sized files
229 	 * can have a break or two and it's okay.
230 	 */
231 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
232 	    lfs_segtod(fs, noff) * 2 < nb) {
233 		retval = COALESCE_NOTWORTHIT;
234 		goto out;
235 	} else if (debug)
236 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
237 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
238 		    noff, (long long)toff, nb);
239 
240 	/* Search for blocks in active segments; don't move them. */
241 	for (i = 0; i < nb; i++) {
242 		if (bip[i].bi_daddr <= 0)
243 			continue;
244 		sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
245 		if (sup->flags & SEGUSE_ACTIVE)
246 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
247 	}
248 
249 	/*
250 	 * Get rid of any blocks we've marked dead.  If this is an older
251 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
252 	 * toss everything here.
253 	 */
254 	onb = nb;
255 	toss_old_blocks(fs, &bip, &nb, NULL);
256 	nb = i;
257 
258 	/*
259 	 * We may have tossed enough blocks that it is no longer worthwhile
260 	 * to rewrite this inode.
261 	 */
262 	if (nb == 0 || onb - nb > log2int(onb)) {
263 		if (debug)
264 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
265 		retval = COALESCE_NOTHINGLEFT;
266 		goto out;
267 	}
268 
269 	/*
270 	 * We are going to rewrite this inode.
271 	 * For any remaining blocks, read in their contents.
272 	 */
273 	for (i = 0; i < nb; i++) {
274 		bip[i].bi_bp = malloc(bip[i].bi_size);
275 		if (bip[i].bi_bp == NULL) {
276 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
277 			    bip[i].bi_size);
278 			retval = COALESCE_NOMEM;
279 			goto out;
280 		}
281 
282 		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
283 			  lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
284 			retval = COALESCE_EIO;
285 			goto out;
286 		}
287 	}
288 	if (debug)
289 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
290 		    (unsigned long long)ino, nb);
291 
292 	/*
293 	 * Write in segment-sized chunks.  If at any point we'd write more
294 	 * than half of the available segments, sleep until that's not
295 	 * true any more.
296 	 */
297 	bps = lfs_segtod(fs, 1);
298 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
299 		do {
300 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
301 			cip = *(CLEANERINFO *)bp->b_data;
302 			brelse(bp, B_INVAL);
303 
304 			if (cip.clean < 4) /* XXX magic number 4 */
305 				kops.ko_fcntl(fs->clfs_ifilefd,
306 				    LFCNSEGWAIT, NULL);
307 		} while(cip.clean < 4);
308 
309 		lim.blkiov = tbip;
310 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
311 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
312 			retval = COALESCE_BADMARKV;
313 			goto out;
314 		}
315 	}
316 
317 	retval = COALESCE_OK;
318 out:
319 	free(dip);
320 	if (bip) {
321 		for (i = 0; i < onb; i++)
322 			if (bip[i].bi_bp)
323 				free(bip[i].bi_bp);
324 		free(bip);
325 	}
326 	return retval;
327 }
328 
329 /*
330  * Try coalescing every inode in the filesystem.
331  * Return the number of inodes actually altered.
332  */
333 int clean_all_inodes(struct clfs *fs)
334 {
335 	int i, r, maxino;
336 	int totals[COALESCE_MAXERROR];
337 	struct stat st;
338 
339 	memset(totals, 0, sizeof(totals));
340 
341 	fstat(fs->clfs_ifilefd, &st);
342 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
343 		fs->lfs_segtabsz - fs->lfs_cleansz;
344 
345 	for (i = 0; i < maxino; i++) {
346 		r = clean_inode(fs, i);
347 		++totals[r];
348 	}
349 
350 	for (i = 0; i < COALESCE_MAXERROR; i++)
351 		if (totals[i])
352 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
353 			       totals[i]);
354 
355 	return totals[COALESCE_OK];
356 }
357 
358 /*
359  * Fork a child process to coalesce this fs.
360  */
361 int
362 fork_coalesce(struct clfs *fs)
363 {
364 	static pid_t childpid;
365 	int num;
366 
367 	/*
368 	 * If already running a coalescing child, don't start a new one.
369 	 */
370 	if (childpid) {
371 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
372 			childpid = 0;
373 	}
374 	if (childpid && kill(childpid, 0) >= 0) {
375 		/* already running a coalesce process */
376 		if (debug)
377 			syslog(LOG_DEBUG, "coalescing already in progress");
378 		return 0;
379 	}
380 
381 	/*
382 	 * Fork a child and let the child coalease
383 	 */
384 	childpid = fork();
385 	if (childpid < 0) {
386 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
387 		return 0;
388 	} else if (childpid == 0) {
389 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
390 		       fs->lfs_fsmnt, getpid());
391 		num = clean_all_inodes(fs);
392 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
393 		       fs->lfs_fsmnt, num);
394 		exit(0);
395 	}
396 
397 	return 0;
398 }
399