xref: /netbsd-src/libexec/lfs_cleanerd/coalesce.c (revision 4f6e0f51f329e33757b9dddc9b0c977bbc5212f9)
1 /*      $NetBSD: coalesce.c,v 1.33 2015/10/10 22:34:46 dholland Exp $  */
2 
3 /*-
4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39 
40 #include <ufs/lfs/lfs.h>
41 
42 #include <fcntl.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <util.h>
50 #include <errno.h>
51 #include <err.h>
52 #include <assert.h>
53 
54 #include <syslog.h>
55 
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 #include "kernelops.h"
60 
61 extern int debug, do_mmap;
62 
63 /*
64  * XXX return the arg to just int when/if we don't need it for
65  * potentially huge block counts any more.
66  */
67 static int
log2int(intmax_t n)68 log2int(intmax_t n)
69 {
70 	int log;
71 
72 	log = 0;
73 	while (n > 0) {
74 		++log;
75 		n >>= 1;
76 	}
77 	return log - 1;
78 }
79 
80 enum coalesce_returncodes {
81 	COALESCE_OK = 0,
82 	COALESCE_NOINODE,
83 	COALESCE_TOOSMALL,
84 	COALESCE_BADSIZE,
85 	COALESCE_BADBLOCKSIZE,
86 	COALESCE_NOMEM,
87 	COALESCE_BADBMAPV,
88 	COALESCE_BADMARKV,
89 	COALESCE_NOTWORTHIT,
90 	COALESCE_NOTHINGLEFT,
91 	COALESCE_EIO,
92 
93 	COALESCE_MAXERROR
94 };
95 
96 const char *coalesce_return[] = {
97 	"Successfully coalesced",
98 	"File not in use or inode not found",
99 	"Not large enough to coalesce",
100 	"Negative size",
101 	"Not enough blocks to account for size",
102 	"Malloc failed",
103 	"LFCNBMAPV failed",
104 	"Not broken enough to fix",
105 	"Too many blocks not found",
106 	"Too many blocks found in active segments",
107 	"I/O error",
108 
109 	"No such error"
110 };
111 
112 static union lfs_dinode *
get_dinode(struct clfs * fs,ino_t ino)113 get_dinode(struct clfs *fs, ino_t ino)
114 {
115 	IFILE *ifp;
116 	daddr_t daddr;
117 	struct ubuf *bp;
118 	union lfs_dinode *dip, *r;
119 	unsigned i;
120 
121 	lfs_ientry(&ifp, fs, ino, &bp);
122 	daddr = lfs_if_getdaddr(fs, ifp);
123 	brelse(bp, 0);
124 
125 	if (daddr == 0x0)
126 		return NULL;
127 
128 	bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
129 	for (i = 0; i < LFS_INOPB(fs); i++) {
130 		dip = DINO_IN_BLOCK(fs, bp->b_data, i);
131 		if (lfs_dino_getinumber(fs, dip) == ino) {
132 			r = malloc(sizeof(*r));
133 			if (r == NULL)
134 				break;
135 			/*
136 			 * Don't just assign the union, as if we're
137 			 * 32-bit and it's the last inode in the block
138 			 * that will run off the end of the buffer.
139 			 */
140 			if (fs->lfs_is64) {
141 				r->u_64 = dip->u_64;
142 			} else {
143 				r->u_32 = dip->u_32;
144 			}
145 			brelse(bp, 0);
146 			return r;
147 		}
148 	}
149 	brelse(bp, 0);
150 	return NULL;
151 }
152 
153 /*
154  * Find out if this inode's data blocks are discontinuous; if they are,
155  * rewrite them using markv.  Return the number of inodes rewritten.
156  */
157 static int
clean_inode(struct clfs * fs,ino_t ino)158 clean_inode(struct clfs *fs, ino_t ino)
159 {
160 	BLOCK_INFO *bip = NULL, *tbip;
161 	CLEANERINFO cip;
162 	struct ubuf *bp;
163 	union lfs_dinode *dip;
164 	struct clfs_seguse *sup;
165 	struct lfs_fcntl_markv /* {
166 		BLOCK_INFO *blkiov;
167 		int blkcnt;
168 	} */ lim;
169 	daddr_t toff;
170 	int noff;
171 	blkcnt_t i, nb, onb;
172 	int retval;
173 	int bps;
174 
175 	dip = get_dinode(fs, ino);
176 	if (dip == NULL)
177 		return COALESCE_NOINODE;
178 
179 	/* Compute file block size, set up for bmapv */
180 	onb = nb = lfs_lblkno(fs, lfs_dino_getsize(fs, dip));
181 
182 	/* XXX for now, don't do any file small enough to have fragments */
183 	if (nb < ULFS_NDADDR) {
184 		free(dip);
185 		return COALESCE_TOOSMALL;
186 	}
187 
188 	/* Sanity checks */
189 #if 0	/* di_size is uint64_t -- this is a noop */
190 	if (lfs_dino_getsize(fs, dip) < 0) {
191 		dlog("ino %d, negative size (%" PRId64 ")", ino,
192 		     lfs_dino_getsize(fs, dip));
193 		free(dip);
194 		return COALESCE_BADSIZE;
195 	}
196 #endif
197 	if (nb > lfs_dino_getblocks(fs, dip)) {
198 		dlog("ino %ju, computed blocks %jd > held blocks %ju",
199 		     (uintmax_t)ino, (intmax_t)nb,
200 		     (uintmax_t)lfs_dino_getblocks(fs, dip));
201 		free(dip);
202 		return COALESCE_BADBLOCKSIZE;
203 	}
204 
205 	/*
206 	 * XXX: We should really coalesce really large files in
207 	 * chunks, as there's substantial diminishing returns and
208 	 * mallocing huge amounts of memory just to get those returns
209 	 * is pretty silly. But that requires a big rework of this
210 	 * code. (On the plus side though then we can stop worrying
211 	 * about block counts > 2^31.)
212 	 */
213 
214 	/* ugh, no DADDR_T_MAX */
215 	__CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
216 	if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
217 		syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
218 		       (uintmax_t)ino, (uintmax_t)nb);
219 		free(dip);
220 		return COALESCE_NOMEM;
221 	}
222 
223 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
224 	if (bip == NULL) {
225 		syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
226 		    (unsigned long long)ino, (intmax_t)nb,
227 		    strerror(errno));
228 		free(dip);
229 		return COALESCE_NOMEM;
230 	}
231 	for (i = 0; i < nb; i++) {
232 		memset(bip + i, 0, sizeof(BLOCK_INFO));
233 		bip[i].bi_inode = ino;
234 		bip[i].bi_lbn = i;
235 		bip[i].bi_version = lfs_dino_getgen(fs, dip);
236 		/* Don't set the size, but let lfs_bmap fill it in */
237 	}
238 	/*
239 	 * The kernel also contains this check; but as lim.blkcnt is
240 	 * only 32 bits wide, we need to check ourselves too in case
241 	 * we'd otherwise truncate a value > 2^31, as that might
242 	 * succeed and create bizarre results.
243 	 */
244 	if (nb > LFS_MARKV_MAXBLKCNT) {
245 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
246 		       lfs_sb_getfsmnt(fs));
247 		retval = COALESCE_BADBMAPV;
248 		goto out;
249 	}
250 	lim.blkiov = bip;
251 	lim.blkcnt = nb;
252 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
253 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
254 		       lfs_sb_getfsmnt(fs));
255 		retval = COALESCE_BADBMAPV;
256 		goto out;
257 	}
258 #if 0
259 	for (i = 0; i < nb; i++) {
260 		printf("bi_size = %d, bi_ino = %ju, "
261 		    "bi_lbn = %jd, bi_daddr = %jd\n",
262 		    bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
263 		    (intmax_t)bip[i].bi_lbn,
264 		    (intmax_t)bip[i].bi_daddr);
265 	}
266 #endif
267 	noff = 0;
268 	toff = 0;
269 	for (i = 1; i < nb; i++) {
270 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
271 			++noff;
272 		toff += llabs(bip[i].bi_daddr - bip[i - 1].bi_daddr
273 		    - lfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
274 	}
275 
276 	/*
277 	 * If this file is not discontinuous, there's no point in rewriting it.
278 	 *
279 	 * Explicitly allow a certain amount of discontinuity, since large
280 	 * files will be broken among segments and medium-sized files
281 	 * can have a break or two and it's okay.
282 	 */
283 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
284 	    lfs_segtod(fs, noff) * 2 < nb) {
285 		retval = COALESCE_NOTWORTHIT;
286 		goto out;
287 	} else if (debug)
288 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
289 		    "%d (%jd) for %jd blocks", (unsigned long long)ino,
290 		    noff, (intmax_t)toff, (intmax_t)nb);
291 
292 	/* Search for blocks in active segments; don't move them. */
293 	for (i = 0; i < nb; i++) {
294 		if (bip[i].bi_daddr <= 0)
295 			continue;
296 		sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
297 		if (sup->flags & SEGUSE_ACTIVE)
298 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
299 	}
300 
301 	/*
302 	 * Get rid of any blocks we've marked dead.  If this is an older
303 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
304 	 * toss everything here.
305 	 */
306 	onb = nb;
307 	toss_old_blocks(fs, &bip, &nb, NULL);
308 	nb = i;
309 
310 	/*
311 	 * We may have tossed enough blocks that it is no longer worthwhile
312 	 * to rewrite this inode.
313 	 */
314 	if (nb == 0 || onb - nb > log2int(onb)) {
315 		if (debug)
316 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
317 		retval = COALESCE_NOTHINGLEFT;
318 		goto out;
319 	}
320 
321 	/*
322 	 * We are going to rewrite this inode.
323 	 * For any remaining blocks, read in their contents.
324 	 */
325 	for (i = 0; i < nb; i++) {
326 		bip[i].bi_bp = malloc(bip[i].bi_size);
327 		if (bip[i].bi_bp == NULL) {
328 			syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
329 			    bip[i].bi_size, strerror(errno));
330 			retval = COALESCE_NOMEM;
331 			goto out;
332 		}
333 
334 		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
335 			  lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
336 			retval = COALESCE_EIO;
337 			goto out;
338 		}
339 	}
340 	if (debug)
341 		syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
342 		    (uintmax_t)ino, (intmax_t)nb);
343 
344 	/*
345 	 * Write in segment-sized chunks.  If at any point we'd write more
346 	 * than half of the available segments, sleep until that's not
347 	 * true any more.
348 	 *
349 	 * XXX the pointer arithmetic in this loop is illegal; replace
350 	 * TBIP with an integer (blkcnt_t) offset.
351 	 */
352 	bps = lfs_segtod(fs, 1);
353 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
354 		do {
355 			bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp);
356 			cip = *(CLEANERINFO *)bp->b_data;
357 			brelse(bp, B_INVAL);
358 
359 			if (lfs_ci_getclean(fs, &cip) < 4) /* XXX magic number 4 */
360 				kops.ko_fcntl(fs->clfs_ifilefd,
361 				    LFCNSEGWAIT, NULL);
362 		} while (lfs_ci_getclean(fs, &cip) < 4);
363 
364 		/*
365 		 * Note that although lim.blkcnt is 32 bits wide, bps
366 		 * (which is blocks-per-segment) is < 2^32 so the
367 		 * value assigned here is always in range.
368 		 */
369 		lim.blkiov = tbip;
370 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
371 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
372 			retval = COALESCE_BADMARKV;
373 			goto out;
374 		}
375 	}
376 
377 	retval = COALESCE_OK;
378 out:
379 	free(dip);
380 	if (bip) {
381 		for (i = 0; i < onb; i++)
382 			if (bip[i].bi_bp)
383 				free(bip[i].bi_bp);
384 		free(bip);
385 	}
386 	return retval;
387 }
388 
389 /*
390  * Try coalescing every inode in the filesystem.
391  * Return the number of inodes actually altered.
392  */
clean_all_inodes(struct clfs * fs)393 int clean_all_inodes(struct clfs *fs)
394 {
395 	int i, r, maxino;
396 	int totals[COALESCE_MAXERROR];
397 	struct stat st;
398 
399 	memset(totals, 0, sizeof(totals));
400 
401 	fstat(fs->clfs_ifilefd, &st);
402 	maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
403 		lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
404 
405 	for (i = 0; i < maxino; i++) {
406 		r = clean_inode(fs, i);
407 		++totals[r];
408 	}
409 
410 	for (i = 0; i < COALESCE_MAXERROR; i++)
411 		if (totals[i])
412 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
413 			       totals[i]);
414 
415 	return totals[COALESCE_OK];
416 }
417 
418 /*
419  * Fork a child process to coalesce this fs.
420  */
421 int
fork_coalesce(struct clfs * fs)422 fork_coalesce(struct clfs *fs)
423 {
424 	static pid_t childpid;
425 	int num;
426 
427 	/*
428 	 * If already running a coalescing child, don't start a new one.
429 	 */
430 	if (childpid) {
431 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
432 			childpid = 0;
433 	}
434 	if (childpid && kill(childpid, 0) >= 0) {
435 		/* already running a coalesce process */
436 		if (debug)
437 			syslog(LOG_DEBUG, "coalescing already in progress");
438 		return 0;
439 	}
440 
441 	/*
442 	 * Fork a child and let the child coalease
443 	 */
444 	childpid = fork();
445 	if (childpid < 0) {
446 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
447 		return 0;
448 	} else if (childpid == 0) {
449 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
450 		       lfs_sb_getfsmnt(fs), getpid());
451 		num = clean_all_inodes(fs);
452 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
453 		       lfs_sb_getfsmnt(fs), num);
454 		exit(0);
455 	}
456 
457 	return 0;
458 }
459