xref: /netbsd-src/sbin/dump/rcache.c (revision 3816d47b2c42fcd6e549e3407f842a5b1a1d23ad)
1 /*	$NetBSD: rcache.c,v 1.22 2008/04/28 20:23:08 martin Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Martin J. Laubach <mjl@emsi.priv.at> and
9  *    Manuel Bouyer <Manuel.Bouyer@lip6.fr>.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __RCSID("$NetBSD: rcache.c,v 1.22 2008/04/28 20:23:08 martin Exp $");
36 #endif /* not lint */
37 
38 #include <sys/types.h>
39 #include <sys/uio.h>
40 #include <sys/mman.h>
41 #include <sys/param.h>
42 #include <sys/sysctl.h>
43 #include <ufs/ufs/dinode.h>
44 
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <unistd.h>
48 #include <fcntl.h>
49 #include <errno.h>
50 #include <string.h>
51 
52 #include "dump.h"
53 
54 /*-----------------------------------------------------------------------*/
/*
 * Limits applied by initcache() when sizing the read cache.
 */
55 #define MAXCACHEBUFS	512	/* upper bound on the number of cache buffers */
56 #define MAXMEMPART	6	/* use at most 1/6 (about 16%) of the user mem */
57 
58 /*-----------------------------------------------------------------------*/
/*
 * One descriptor slot of the cache region.
 *
 * The same union is used for two purposes: the very first slot of the
 * region (cheader) only uses cd_count, the counter that hands out access
 * stamps; every following slot (cdesc[]) uses the desc member to describe
 * one cache buffer.  The cd_* macros below give short names to the desc
 * fields.
 */
59 union cdesc {
	/* header slot only: next access stamp to hand out */
60 	volatile size_t cd_count;
61 	struct {
		/* first device block held in the buffer */
62 		volatile daddr_t blkstart;
63 		volatile daddr_t blkend;	/* one past the last block held; 0 = slot holds no data */
		/* number of device blocks already copied out of this buffer */
64 		volatile daddr_t blocksRead;
		/* access stamp used by findlru(); reset to 0 once fully read */
65 		volatile size_t time;
66 #ifdef DIAGNOSTICS
		/* pid of the process currently refilling the slot, else 0 */
67 		volatile pid_t owner;
68 #endif
69 	} desc;
70 #define cd_blkstart	desc.blkstart
71 #define cd_blkend	desc.blkend
72 #define cd_blocksRead	desc.blocksRead
73 #define cd_time		desc.time
74 #define cd_owner	desc.owner
75 };
76 
77 static int findlru(void);
78 
/*
 * Layout of the region set up by initcache():
 *   [ header slot ][ cachebufs descriptors ][ cachebufs data buffers ]
 */
/* Start of the region; while NULL, bread() bypasses the cache */
79 static void *shareBuffer = NULL;
/* Header slot at the start of the region (holds the access counter) */
80 static union cdesc *cheader;
/* Array of cachebufs buffer descriptors, right after the header slot */
81 static union cdesc *cdesc;
/* Start of the data buffers, right after the descriptors */
82 static char *cdata;
/* Number of cache buffers in use */
83 static int cachebufs;
/* Size of one cache buffer, in device blocks */
84 static int nblksread;
85 
/* Per-process read counters, reported by printcachestats() */
86 #ifdef STATS
87 static int nreads;
88 static int nphysread;
89 static int64_t readsize;
90 static int64_t physreadsize;
91 #endif
92 
93 #define	CSIZE		(nblksread << dev_bshift)	/* cache buf size */
/* Address of the data buffer that belongs to descriptor `desc' */
94 #define	CDATA(desc)	(cdata + ((desc) - cdesc) * CSIZE)
95 
/*
 * Set up the shared read cache used by bread().
 *
 * readblksize is the size in bytes of one cache buffer; it is rounded up
 * to whole filesystem blocks and then expressed in device blocks
 * (nblksread).  cachesize is the number of buffers wanted: -1 derives it
 * from hw.usermem64 (1/MAXMEMPART of it), 0 or a negative value disables
 * caching.  The buffer count is capped at MAXCACHEBUFS.
 *
 * Whenever the cache cannot be set up, shareBuffer stays NULL so that
 * bread() reads directly from disk.
 */
void
initcache(int cachesize, int readblksize)
{
	size_t len;
	size_t sharedSize;
	void *region;

	/* Convert read block size in terms of filesystem block size */
	nblksread = howmany(readblksize, ufsib->ufs_bsize);

	/* Then, convert it in terms of device block size */
	nblksread <<= ufsib->ufs_bshift - dev_bshift;

	/*
	 * An empty cache buffer is useless and would make CSIZE/nblksread
	 * a zero divisor both below and in bread(): run without a cache.
	 */
	if (nblksread <= 0 || CSIZE <= 0) {
		cachebufs = 0;
		return;
	}

	if (cachesize == -1) {	/* Compute from memory available */
		uint64_t usermem, nbufs;
		int mib[2] = { CTL_HW, HW_USERMEM64 };

		len = sizeof(usermem);
		if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) {
			msg("sysctl(hw.usermem) failed: %s\n",
			    strerror(errno));
			return;
		}
		/* Clamp in 64 bits, before narrowing to int */
		nbufs = (usermem / MAXMEMPART) / CSIZE;
		cachebufs = nbufs > MAXCACHEBUFS ? MAXCACHEBUFS : (int)nbufs;
	} else {		/* User specified */
		cachebufs = cachesize;
	}

	/* Don't allocate if zero (or nonsensical) --> no caching */
	if (cachebufs <= 0) {
		cachebufs = 0;
		return;
	}
	if (cachebufs > MAXCACHEBUFS)
		cachebufs = MAXCACHEBUFS;

	/* One header slot, one descriptor per buffer, then the buffers */
	sharedSize = sizeof(union cdesc) +
	    sizeof(union cdesc) * (size_t)cachebufs +
	    (size_t)cachebufs * (size_t)CSIZE;
#ifdef STATS
	fprintf(stderr, "Using %d buffers (%zu bytes)\n", cachebufs,
	    sharedSize);
#endif
	region = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_SHARED, -1, 0);
	if (region == MAP_FAILED) {
		msg("can't mmap shared memory for buffer: %s\n",
		    strerror(errno));
		/*
		 * MAP_FAILED is not NULL: never publish it, or bread()
		 * would believe the cache exists.
		 */
		shareBuffer = NULL;
		cachebufs = 0;
		return;
	}

	memset(region, '\0', sharedSize);

	shareBuffer = region;
	cheader = region;
	cdesc = (union cdesc *)((char *)region + sizeof(union cdesc));
	cdata = (char *)region + sizeof(union cdesc) +
	    sizeof(union cdesc) * (size_t)cachebufs;
}
150 
151 /*
152  * Find the cache buffer descriptor that shows the minimal access time
153  */
154 static int
155 findlru(void)
156 {
157 	int	i;
158 	size_t	minTime = cdesc[0].cd_time;
159 	int	minIdx = 0;
160 
161 	for (i = 0; i < cachebufs; i++) {
162 		if (cdesc[i].cd_time < minTime) {
163 			minIdx = i;
164 			minTime = cdesc[i].cd_time;
165 		}
166 	}
167 
168 	return minIdx;
169 }
170 
171 /*
172  * Read data directly from disk, with smart error handling.
173  * Try to recover from hard errors by reading in sector sized pieces.
174  * Error recovery is attempted at most BREADEMAX times before seeking
175  * consent from the operator to continue.
176  */
177 
178 static int breaderrors = 0;
179 #define BREADEMAX 32
180 
181 void
182 rawread(daddr_t blkno, char *buf, int size)
183 {
184 	int cnt, i;
185 
186 #ifdef STATS
187 	nphysread++;
188 	physreadsize += size;
189 #endif
190 
191 loop:
192 	if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) {
193 		msg("rawread: lseek fails\n");
194 		goto err;
195 	}
196 	if ((cnt = read(diskfd, buf, size)) == size)
197 		return;
198 	if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) {
199 		/*
200 		 * Trying to read the final fragment.
201 		 *
202 		 * NB - dump only works in TP_BSIZE blocks, hence
203 		 * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
204 		 * It should be smarter about not actually trying to
205 		 * read more than it can get, but for the time being
206 		 * we punt and scale back the read only when it gets
207 		 * us into trouble. (mkm 9/25/83)
208 		 */
209 		size -= dev_bsize;
210 		goto loop;
211 	}
212 	if (cnt == -1)
213 		msg("read error from %s: %s: [block %lld]: count=%d\n",
214 		    disk, strerror(errno), (long long)blkno, size);
215 	else
216 		msg("short read error from %s: [block %lld]: "
217 		    "count=%d, got=%d\n",
218 		    disk, (long long)blkno, size, cnt);
219 err:
220 	if (++breaderrors > BREADEMAX) {
221 		msg("More than %d block read errors from %s\n",
222 		    BREADEMAX, disk);
223 		broadcast("DUMP IS AILING!\n");
224 		msg("This is an unrecoverable error.\n");
225 		if (!query("Do you want to attempt to continue?")) {
226 			dumpabort(0);
227 			/*NOTREACHED*/
228 		} else
229 			breaderrors = 0;
230 	}
231 	/*
232 	 * Zero buffer, then try to read each sector of buffer separately.
233 	 */
234 	memset(buf, 0, size);
235 	for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) {
236 		if (lseek(diskfd, ((off_t)blkno << dev_bshift),
237 		    SEEK_SET) == -1) {
238 			msg("rawread: lseek2 fails: %s!\n",
239 			    strerror(errno));
240 			continue;
241 		}
242 		if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize)
243 			continue;
244 		if (cnt == -1) {
245 			msg("read error from %s: %s: [sector %lld]: "
246 			    "count=%ld\n", disk, strerror(errno),
247 			    (long long)blkno, dev_bsize);
248 			continue;
249 		}
250 		msg("short read error from %s: [sector %lld]: "
251 		    "count=%ld, got=%d\n",
252 		    disk, (long long)blkno, dev_bsize, cnt);
253 	}
254 }
255 
256 void
257 bread(daddr_t blkno, char *buf, int size)
258 {
259 	int	osize = size, idx;
260 	daddr_t oblkno = blkno;
261 	char   *obuf = buf;
262 	daddr_t numBlocks = howmany(size, dev_bsize);
263 
264 #ifdef STATS
265 	nreads++;
266 	readsize += size;
267 #endif
268 
269 	if (!shareBuffer) {
270 		rawread(blkno, buf, size);
271 		return;
272 	}
273 
274 	if (flock(diskfd, LOCK_EX)) {
275 		msg("flock(LOCK_EX) failed: %s\n",
276 		    strerror(errno));
277 		rawread(blkno, buf, size);
278 		return;
279 	}
280 
281 retry:
282 	idx = 0;
283 	while (size > 0) {
284 		int	i;
285 
286 		for (i = 0; i < cachebufs; i++) {
287 			union cdesc *curr = &cdesc[(i + idx) % cachebufs];
288 
289 #ifdef DIAGNOSTICS
290 			if (curr->cd_owner) {
291 				fprintf(stderr, "Owner is set (%d, me=%d), can"
292 				    "not happen.\n", curr->cd_owner, getpid());
293 			}
294 #endif
295 
296 			if (curr->cd_blkend == 0)
297 				continue;
298 			/*
299 			 * If we find a bit of the read in the buffers,
300 			 * now compute how many blocks we can copy,
301 			 * copy them out, adjust blkno, buf and size,
302 			 * and restart
303 			 */
304 			if (curr->cd_blkstart <= blkno &&
305 			    blkno < curr->cd_blkend) {
306 				/* Number of data blocks to be copied */
307 				int toCopy = MIN(size,
308 				    (curr->cd_blkend - blkno) << dev_bshift);
309 #ifdef DIAGNOSTICS
310 				if (toCopy <= 0 || toCopy > CSIZE) {
311 					fprintf(stderr, "toCopy %d !\n",
312 					    toCopy);
313 					dumpabort(0);
314 				}
315 				if (CDATA(curr) +
316 				    ((blkno - curr->cd_blkstart) <<
317 				    dev_bshift) < CDATA(curr) ||
318 			   	    CDATA(curr) +
319 				    ((blkno - curr->cd_blkstart) <<
320 			   	    dev_bshift) > CDATA(curr) + CSIZE) {
321 					fprintf(stderr, "%p < %p !!!\n",
322 				   	   CDATA(curr) + ((blkno -
323 					   curr->cd_blkstart) << dev_bshift),
324 					   CDATA(curr));
325 					fprintf(stderr,
326 					    "cdesc[i].cd_blkstart %lld "
327 					    "blkno %lld dev_bsize %ld\n",
328 				   	    (long long)curr->cd_blkstart,
329 					    (long long)blkno,
330 					    dev_bsize);
331 					dumpabort(0);
332 				}
333 #endif
334 				memcpy(buf, CDATA(curr) +
335 				    ((blkno - curr->cd_blkstart) <<
336 				    dev_bshift),
337 			   	    toCopy);
338 
339 				buf 	+= toCopy;
340 				size 	-= toCopy;
341 				blkno 	+= howmany(toCopy, dev_bsize);
342 				numBlocks -= howmany(toCopy, dev_bsize);
343 
344 				curr->cd_time = cheader->cd_count++;
345 
346 				/*
347 				 * If all data of a cache block have been
348 				 * read, chances are good no more reads
349 				 * will occur, so expire the cache immediately
350 				 */
351 
352 				curr->cd_blocksRead +=
353 				    howmany(toCopy, dev_bsize);
354 				if (curr->cd_blocksRead >= nblksread)
355 					curr->cd_time = 0;
356 
357 				goto retry;
358 			}
359 		}
360 
361 		/* No more to do? */
362 		if (size == 0)
363 			break;
364 
365 		/*
366 		 * This does actually not happen if fs blocks are not greater
367 		 * than nblksread.
368 		 */
369 		if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) {
370 			rawread(oblkno, obuf, osize);
371 			break;
372 		} else {
373 			ssize_t	rsize;
374 			daddr_t	blockBlkNo;
375 
376 			blockBlkNo = (blkno / nblksread) * nblksread;
377 			idx = findlru();
378 			rsize = MIN(nblksread,
379 			    ufsib->ufs_dsize - blockBlkNo) << dev_bshift;
380 
381 #ifdef DIAGNOSTICS
382 			if (cdesc[idx].cd_owner)
383 				fprintf(stderr, "Owner is set (%d, me=%d), can"
384 				    "not happen(2).\n", cdesc[idx].cd_owner,
385 				    getpid());
386 			cdesc[idx].cd_owner = getpid();
387 #endif
388 			cdesc[idx].cd_time = cheader->cd_count++;
389 			cdesc[idx].cd_blkstart = blockBlkNo;
390 			cdesc[idx].cd_blkend = 0;
391 			cdesc[idx].cd_blocksRead = 0;
392 
393 			if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift),
394 			    SEEK_SET) == -1) {
395 				msg("readBlocks: lseek fails: %s\n",
396 				    strerror(errno));
397 				rsize = -1;
398 			} else {
399 				rsize = read(diskfd,
400 				    CDATA(&cdesc[idx]), rsize);
401 				if (rsize < 0) {
402 					msg("readBlocks: read fails: %s\n",
403 					    strerror(errno));
404 				}
405 			}
406 
407 			/* On errors, panic, punt, try to read without
408 			 * cache and let raw read routine do the rest.
409 			 */
410 
411 			if (rsize <= 0) {
412 				rawread(oblkno, obuf, osize);
413 #ifdef DIAGNOSTICS
414 				if (cdesc[idx].cd_owner != getpid())
415 					fprintf(stderr, "Owner changed from "
416 					    "%d to %d, can't happen\n",
417 					    getpid(), cdesc[idx].cd_owner);
418 				cdesc[idx].cd_owner = 0;
419 #endif
420 				break;
421 			}
422 
423 			/* On short read, just note the fact and go on */
424 			cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize;
425 
426 #ifdef STATS
427 			nphysread++;
428 			physreadsize += rsize;
429 #endif
430 #ifdef DIAGNOSTICS
431 			if (cdesc[idx].cd_owner != getpid())
432 				fprintf(stderr, "Owner changed from "
433 				    "%d to %d, can't happen\n",
434 				    getpid(), cdesc[idx].cd_owner);
435 			cdesc[idx].cd_owner = 0;
436 #endif
437 			/*
438 			 * We swapped some of data in, let the loop fetch
439 			 * them from cache
440 			 */
441 		}
442 	}
443 
444 	if (flock(diskfd, LOCK_UN))
445 		msg("flock(LOCK_UN) failed: %s\n",
446 		    strerror(errno));
447 }
448 
/*
 * Print this process's read statistics to stderr.
 *
 * Only does something when built with STATS.  Reports the number and
 * total size of the reads requested through bread(), the number and
 * total size of the reads actually issued to the disk, the resulting hit
 * percentage and the percentage of extra bytes read.
 */
void
printcachestats(void)
{

#ifdef STATS
	int hitpct = 0;
	int overheadpct = 0;

	/* A process that never read anything must not divide by zero */
	if (nreads != 0)
		hitpct = (int)(((long long)nreads - nphysread) * 100 /
		    nreads);
	if (readsize != 0)
		overheadpct = (int)(((physreadsize - readsize) * 100) /
		    readsize);

	/* Byte counters are 64 bits wide: print them without truncation */
	fprintf(stderr, "Pid %d: %d reads (%llu bytes) "
	    "%d physical reads (%llu bytes) %d%% hits, %d%% overhead\n",
	    (int)getpid(), nreads, (unsigned long long)readsize, nphysread,
	    (unsigned long long)physreadsize, hitpct, overheadpct);
#endif
}
461