1 /* $NetBSD: coalesce.c,v 1.33 2015/10/10 22:34:46 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant@hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39
40 #include <ufs/lfs/lfs.h>
41
42 #include <fcntl.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <util.h>
50 #include <errno.h>
51 #include <err.h>
52 #include <assert.h>
53
54 #include <syslog.h>
55
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 #include "kernelops.h"
60
61 extern int debug, do_mmap;
62
63 /*
64 * XXX return the arg to just int when/if we don't need it for
65 * potentially huge block counts any more.
66 */
67 static int
log2int(intmax_t n)68 log2int(intmax_t n)
69 {
70 int log;
71
72 log = 0;
73 while (n > 0) {
74 ++log;
75 n >>= 1;
76 }
77 return log - 1;
78 }
79
80 enum coalesce_returncodes {
81 COALESCE_OK = 0,
82 COALESCE_NOINODE,
83 COALESCE_TOOSMALL,
84 COALESCE_BADSIZE,
85 COALESCE_BADBLOCKSIZE,
86 COALESCE_NOMEM,
87 COALESCE_BADBMAPV,
88 COALESCE_BADMARKV,
89 COALESCE_NOTWORTHIT,
90 COALESCE_NOTHINGLEFT,
91 COALESCE_EIO,
92
93 COALESCE_MAXERROR
94 };
95
96 const char *coalesce_return[] = {
97 "Successfully coalesced",
98 "File not in use or inode not found",
99 "Not large enough to coalesce",
100 "Negative size",
101 "Not enough blocks to account for size",
102 "Malloc failed",
103 "LFCNBMAPV failed",
104 "Not broken enough to fix",
105 "Too many blocks not found",
106 "Too many blocks found in active segments",
107 "I/O error",
108
109 "No such error"
110 };
111
112 static union lfs_dinode *
get_dinode(struct clfs * fs,ino_t ino)113 get_dinode(struct clfs *fs, ino_t ino)
114 {
115 IFILE *ifp;
116 daddr_t daddr;
117 struct ubuf *bp;
118 union lfs_dinode *dip, *r;
119 unsigned i;
120
121 lfs_ientry(&ifp, fs, ino, &bp);
122 daddr = lfs_if_getdaddr(fs, ifp);
123 brelse(bp, 0);
124
125 if (daddr == 0x0)
126 return NULL;
127
128 bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
129 for (i = 0; i < LFS_INOPB(fs); i++) {
130 dip = DINO_IN_BLOCK(fs, bp->b_data, i);
131 if (lfs_dino_getinumber(fs, dip) == ino) {
132 r = malloc(sizeof(*r));
133 if (r == NULL)
134 break;
135 /*
136 * Don't just assign the union, as if we're
137 * 32-bit and it's the last inode in the block
138 * that will run off the end of the buffer.
139 */
140 if (fs->lfs_is64) {
141 r->u_64 = dip->u_64;
142 } else {
143 r->u_32 = dip->u_32;
144 }
145 brelse(bp, 0);
146 return r;
147 }
148 }
149 brelse(bp, 0);
150 return NULL;
151 }
152
153 /*
154 * Find out if this inode's data blocks are discontinuous; if they are,
155 * rewrite them using markv. Return the number of inodes rewritten.
156 */
157 static int
clean_inode(struct clfs * fs,ino_t ino)158 clean_inode(struct clfs *fs, ino_t ino)
159 {
160 BLOCK_INFO *bip = NULL, *tbip;
161 CLEANERINFO cip;
162 struct ubuf *bp;
163 union lfs_dinode *dip;
164 struct clfs_seguse *sup;
165 struct lfs_fcntl_markv /* {
166 BLOCK_INFO *blkiov;
167 int blkcnt;
168 } */ lim;
169 daddr_t toff;
170 int noff;
171 blkcnt_t i, nb, onb;
172 int retval;
173 int bps;
174
175 dip = get_dinode(fs, ino);
176 if (dip == NULL)
177 return COALESCE_NOINODE;
178
179 /* Compute file block size, set up for bmapv */
180 onb = nb = lfs_lblkno(fs, lfs_dino_getsize(fs, dip));
181
182 /* XXX for now, don't do any file small enough to have fragments */
183 if (nb < ULFS_NDADDR) {
184 free(dip);
185 return COALESCE_TOOSMALL;
186 }
187
188 /* Sanity checks */
189 #if 0 /* di_size is uint64_t -- this is a noop */
190 if (lfs_dino_getsize(fs, dip) < 0) {
191 dlog("ino %d, negative size (%" PRId64 ")", ino,
192 lfs_dino_getsize(fs, dip));
193 free(dip);
194 return COALESCE_BADSIZE;
195 }
196 #endif
197 if (nb > lfs_dino_getblocks(fs, dip)) {
198 dlog("ino %ju, computed blocks %jd > held blocks %ju",
199 (uintmax_t)ino, (intmax_t)nb,
200 (uintmax_t)lfs_dino_getblocks(fs, dip));
201 free(dip);
202 return COALESCE_BADBLOCKSIZE;
203 }
204
205 /*
206 * XXX: We should really coalesce really large files in
207 * chunks, as there's substantial diminishing returns and
208 * mallocing huge amounts of memory just to get those returns
209 * is pretty silly. But that requires a big rework of this
210 * code. (On the plus side though then we can stop worrying
211 * about block counts > 2^31.)
212 */
213
214 /* ugh, no DADDR_T_MAX */
215 __CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
216 if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
217 syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
218 (uintmax_t)ino, (uintmax_t)nb);
219 free(dip);
220 return COALESCE_NOMEM;
221 }
222
223 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
224 if (bip == NULL) {
225 syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
226 (unsigned long long)ino, (intmax_t)nb,
227 strerror(errno));
228 free(dip);
229 return COALESCE_NOMEM;
230 }
231 for (i = 0; i < nb; i++) {
232 memset(bip + i, 0, sizeof(BLOCK_INFO));
233 bip[i].bi_inode = ino;
234 bip[i].bi_lbn = i;
235 bip[i].bi_version = lfs_dino_getgen(fs, dip);
236 /* Don't set the size, but let lfs_bmap fill it in */
237 }
238 /*
239 * The kernel also contains this check; but as lim.blkcnt is
240 * only 32 bits wide, we need to check ourselves too in case
241 * we'd otherwise truncate a value > 2^31, as that might
242 * succeed and create bizarre results.
243 */
244 if (nb > LFS_MARKV_MAXBLKCNT) {
245 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
246 lfs_sb_getfsmnt(fs));
247 retval = COALESCE_BADBMAPV;
248 goto out;
249 }
250 lim.blkiov = bip;
251 lim.blkcnt = nb;
252 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
253 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
254 lfs_sb_getfsmnt(fs));
255 retval = COALESCE_BADBMAPV;
256 goto out;
257 }
258 #if 0
259 for (i = 0; i < nb; i++) {
260 printf("bi_size = %d, bi_ino = %ju, "
261 "bi_lbn = %jd, bi_daddr = %jd\n",
262 bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
263 (intmax_t)bip[i].bi_lbn,
264 (intmax_t)bip[i].bi_daddr);
265 }
266 #endif
267 noff = 0;
268 toff = 0;
269 for (i = 1; i < nb; i++) {
270 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
271 ++noff;
272 toff += llabs(bip[i].bi_daddr - bip[i - 1].bi_daddr
273 - lfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
274 }
275
276 /*
277 * If this file is not discontinuous, there's no point in rewriting it.
278 *
279 * Explicitly allow a certain amount of discontinuity, since large
280 * files will be broken among segments and medium-sized files
281 * can have a break or two and it's okay.
282 */
283 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
284 lfs_segtod(fs, noff) * 2 < nb) {
285 retval = COALESCE_NOTWORTHIT;
286 goto out;
287 } else if (debug)
288 syslog(LOG_DEBUG, "ino %llu total discontinuity "
289 "%d (%jd) for %jd blocks", (unsigned long long)ino,
290 noff, (intmax_t)toff, (intmax_t)nb);
291
292 /* Search for blocks in active segments; don't move them. */
293 for (i = 0; i < nb; i++) {
294 if (bip[i].bi_daddr <= 0)
295 continue;
296 sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
297 if (sup->flags & SEGUSE_ACTIVE)
298 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
299 }
300
301 /*
302 * Get rid of any blocks we've marked dead. If this is an older
303 * kernel that doesn't have bmapv fill in the block sizes, we'll
304 * toss everything here.
305 */
306 onb = nb;
307 toss_old_blocks(fs, &bip, &nb, NULL);
308 nb = i;
309
310 /*
311 * We may have tossed enough blocks that it is no longer worthwhile
312 * to rewrite this inode.
313 */
314 if (nb == 0 || onb - nb > log2int(onb)) {
315 if (debug)
316 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
317 retval = COALESCE_NOTHINGLEFT;
318 goto out;
319 }
320
321 /*
322 * We are going to rewrite this inode.
323 * For any remaining blocks, read in their contents.
324 */
325 for (i = 0; i < nb; i++) {
326 bip[i].bi_bp = malloc(bip[i].bi_size);
327 if (bip[i].bi_bp == NULL) {
328 syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
329 bip[i].bi_size, strerror(errno));
330 retval = COALESCE_NOMEM;
331 goto out;
332 }
333
334 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
335 lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
336 retval = COALESCE_EIO;
337 goto out;
338 }
339 }
340 if (debug)
341 syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
342 (uintmax_t)ino, (intmax_t)nb);
343
344 /*
345 * Write in segment-sized chunks. If at any point we'd write more
346 * than half of the available segments, sleep until that's not
347 * true any more.
348 *
349 * XXX the pointer arithmetic in this loop is illegal; replace
350 * TBIP with an integer (blkcnt_t) offset.
351 */
352 bps = lfs_segtod(fs, 1);
353 for (tbip = bip; tbip < bip + nb; tbip += bps) {
354 do {
355 bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp);
356 cip = *(CLEANERINFO *)bp->b_data;
357 brelse(bp, B_INVAL);
358
359 if (lfs_ci_getclean(fs, &cip) < 4) /* XXX magic number 4 */
360 kops.ko_fcntl(fs->clfs_ifilefd,
361 LFCNSEGWAIT, NULL);
362 } while (lfs_ci_getclean(fs, &cip) < 4);
363
364 /*
365 * Note that although lim.blkcnt is 32 bits wide, bps
366 * (which is blocks-per-segment) is < 2^32 so the
367 * value assigned here is always in range.
368 */
369 lim.blkiov = tbip;
370 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
371 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
372 retval = COALESCE_BADMARKV;
373 goto out;
374 }
375 }
376
377 retval = COALESCE_OK;
378 out:
379 free(dip);
380 if (bip) {
381 for (i = 0; i < onb; i++)
382 if (bip[i].bi_bp)
383 free(bip[i].bi_bp);
384 free(bip);
385 }
386 return retval;
387 }
388
389 /*
390 * Try coalescing every inode in the filesystem.
391 * Return the number of inodes actually altered.
392 */
clean_all_inodes(struct clfs * fs)393 int clean_all_inodes(struct clfs *fs)
394 {
395 int i, r, maxino;
396 int totals[COALESCE_MAXERROR];
397 struct stat st;
398
399 memset(totals, 0, sizeof(totals));
400
401 fstat(fs->clfs_ifilefd, &st);
402 maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
403 lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
404
405 for (i = 0; i < maxino; i++) {
406 r = clean_inode(fs, i);
407 ++totals[r];
408 }
409
410 for (i = 0; i < COALESCE_MAXERROR; i++)
411 if (totals[i])
412 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
413 totals[i]);
414
415 return totals[COALESCE_OK];
416 }
417
418 /*
419 * Fork a child process to coalesce this fs.
420 */
421 int
fork_coalesce(struct clfs * fs)422 fork_coalesce(struct clfs *fs)
423 {
424 static pid_t childpid;
425 int num;
426
427 /*
428 * If already running a coalescing child, don't start a new one.
429 */
430 if (childpid) {
431 if (waitpid(childpid, NULL, WNOHANG) == childpid)
432 childpid = 0;
433 }
434 if (childpid && kill(childpid, 0) >= 0) {
435 /* already running a coalesce process */
436 if (debug)
437 syslog(LOG_DEBUG, "coalescing already in progress");
438 return 0;
439 }
440
441 /*
442 * Fork a child and let the child coalease
443 */
444 childpid = fork();
445 if (childpid < 0) {
446 syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
447 return 0;
448 } else if (childpid == 0) {
449 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
450 lfs_sb_getfsmnt(fs), getpid());
451 num = clean_all_inodes(fs);
452 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
453 lfs_sb_getfsmnt(fs), num);
454 exit(0);
455 }
456
457 return 0;
458 }
459