xref: /netbsd-src/sbin/dump/rcache.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: rcache.c,v 1.23 2010/01/27 12:20:25 spz Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Martin J. Laubach <mjl@emsi.priv.at> and
9  *    Manuel Bouyer <Manuel.Bouyer@lip6.fr>.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __RCSID("$NetBSD: rcache.c,v 1.23 2010/01/27 12:20:25 spz Exp $");
36 #endif /* not lint */
37 
38 #include <sys/types.h>
39 #include <sys/uio.h>
40 #include <sys/mman.h>
41 #include <sys/param.h>
42 #include <sys/sysctl.h>
43 #include <ufs/ufs/dinode.h>
44 
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <unistd.h>
48 #include <fcntl.h>
49 #include <errno.h>
50 #include <string.h>
51 
52 #include "dump.h"
53 
54 /*-----------------------------------------------------------------------*/
/* Hard upper bound on the number of cache buffers initcache() will use. */
#define MAXCACHEBUFS	512
/*
 * Divisor applied to the user memory size when the cache is sized
 * automatically: at most 1/MAXMEMPART (one sixth) of it is used.
 */
#define MAXMEMPART	6
57 
58 /*-----------------------------------------------------------------------*/
59 union cdesc {
60 	volatile size_t cd_count;
61 	struct {
62 		volatile daddr_t blkstart;
63 		volatile daddr_t blkend;	/* start + nblksread */
64 		volatile daddr_t blocksRead;
65 		volatile size_t time;
66 #ifdef DIAGNOSTICS
67 		volatile pid_t owner;
68 #endif
69 	} desc;
70 #define cd_blkstart	desc.blkstart
71 #define cd_blkend	desc.blkend
72 #define cd_blocksRead	desc.blocksRead
73 #define cd_time		desc.time
74 #define cd_owner	desc.owner
75 };
76 
static int findlru(void);

/*
 * Layout of the shared mapping created by initcache():
 *   [ header descriptor ][ cachebufs descriptors ][ cachebufs data buffers ]
 */
static void *shareBuffer = NULL;	/* base of the mapping, NULL = no cache */
static union cdesc *cheader;		/* header descriptor (cd_count) */
static union cdesc *cdesc;		/* per-buffer descriptors */
static char *cdata;			/* start of the data buffers */
static int cachebufs;			/* number of cache buffers */
static int nblksread;			/* device blocks per cache buffer */

#ifdef STATS
/* Per-process counters reported by printcachestats(). */
static int nreads;			/* calls to bread() */
static int nphysread;			/* reads actually issued to the disk */
static int64_t readsize;		/* bytes requested through bread() */
static int64_t physreadsize;		/* bytes requested from the disk */
#endif
92 
/* Size in bytes of one cache buffer (nblksread device blocks). */
#define	CSIZE		(nblksread << dev_bshift)
/* Address of the data buffer that belongs to descriptor `desc'. */
#define	CDATA(desc)	(cdata + ((desc) - cdesc) * CSIZE)
95 
96 void
97 initcache(int cachesize, int readblksize)
98 {
99 	size_t len;
100 	size_t sharedSize;
101 
102 	/* Convert read block size in terms of filesystem block size */
103 	nblksread = howmany(readblksize, ufsib->ufs_bsize);
104 
105 	/* Then, convert it in terms of device block size */
106 	nblksread <<= ufsib->ufs_bshift - dev_bshift;
107 
108 	if (cachesize == -1) {	/* Compute from memory available */
109 		uint64_t usermem, cachetmp;
110 		int mib[2] = { CTL_HW, HW_USERMEM64 };
111 
112 		len = sizeof(usermem);
113 		if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) {
114 			msg("sysctl(hw.usermem) failed: %s\n",
115 			    strerror(errno));
116 			return;
117 		}
118 		cachetmp = (usermem / MAXMEMPART) / CSIZE;
119 		/* for those with TB of RAM */
120 		cachebufs = (cachetmp > INT_MAX) ? INT_MAX : cachetmp;
121 	} else {		/* User specified */
122 		cachebufs = cachesize;
123 	}
124 
125 	if (cachebufs) {	/* Don't allocate if zero --> no caching */
126 		if (cachebufs > MAXCACHEBUFS)
127 			cachebufs = MAXCACHEBUFS;
128 
129 		sharedSize = sizeof(union cdesc) +
130 	   	    sizeof(union cdesc) * cachebufs +
131 	   	    cachebufs * CSIZE;
132 #ifdef STATS
133 		fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs,
134 	   	    sharedSize);
135 #endif
136 		shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE,
137 	   	    MAP_ANON | MAP_SHARED, -1, 0);
138 		if (shareBuffer == MAP_FAILED) {
139 			msg("can't mmap shared memory for buffer: %s\n",
140 			    strerror(errno));
141 			return;
142 		}
143 		cheader = shareBuffer;
144 		cdesc = (union cdesc *) (((char *) shareBuffer) +
145 		    sizeof(union cdesc));
146 		cdata = ((char *) shareBuffer) + sizeof(union cdesc) +
147 	   	    sizeof(union cdesc) * cachebufs;
148 
149 		memset(shareBuffer, '\0', sharedSize);
150 	}
151 }
152 
153 /*
154  * Find the cache buffer descriptor that shows the minimal access time
155  */
156 static int
157 findlru(void)
158 {
159 	int	i;
160 	size_t	minTime = cdesc[0].cd_time;
161 	int	minIdx = 0;
162 
163 	for (i = 0; i < cachebufs; i++) {
164 		if (cdesc[i].cd_time < minTime) {
165 			minIdx = i;
166 			minTime = cdesc[i].cd_time;
167 		}
168 	}
169 
170 	return minIdx;
171 }
172 
173 /*
174  * Read data directly from disk, with smart error handling.
175  * Try to recover from hard errors by reading in sector sized pieces.
176  * Error recovery is attempted at most BREADEMAX times before seeking
177  * consent from the operator to continue.
178  */
179 
180 static int breaderrors = 0;
181 #define BREADEMAX 32
182 
183 void
184 rawread(daddr_t blkno, char *buf, int size)
185 {
186 	int cnt, i;
187 
188 #ifdef STATS
189 	nphysread++;
190 	physreadsize += size;
191 #endif
192 
193 loop:
194 	if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) {
195 		msg("rawread: lseek fails\n");
196 		goto err;
197 	}
198 	if ((cnt = read(diskfd, buf, size)) == size)
199 		return;
200 	if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) {
201 		/*
202 		 * Trying to read the final fragment.
203 		 *
204 		 * NB - dump only works in TP_BSIZE blocks, hence
205 		 * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
206 		 * It should be smarter about not actually trying to
207 		 * read more than it can get, but for the time being
208 		 * we punt and scale back the read only when it gets
209 		 * us into trouble. (mkm 9/25/83)
210 		 */
211 		size -= dev_bsize;
212 		goto loop;
213 	}
214 	if (cnt == -1)
215 		msg("read error from %s: %s: [block %lld]: count=%d\n",
216 		    disk, strerror(errno), (long long)blkno, size);
217 	else
218 		msg("short read error from %s: [block %lld]: "
219 		    "count=%d, got=%d\n",
220 		    disk, (long long)blkno, size, cnt);
221 err:
222 	if (++breaderrors > BREADEMAX) {
223 		msg("More than %d block read errors from %s\n",
224 		    BREADEMAX, disk);
225 		broadcast("DUMP IS AILING!\n");
226 		msg("This is an unrecoverable error.\n");
227 		if (!query("Do you want to attempt to continue?")) {
228 			dumpabort(0);
229 			/*NOTREACHED*/
230 		} else
231 			breaderrors = 0;
232 	}
233 	/*
234 	 * Zero buffer, then try to read each sector of buffer separately.
235 	 */
236 	memset(buf, 0, size);
237 	for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) {
238 		if (lseek(diskfd, ((off_t)blkno << dev_bshift),
239 		    SEEK_SET) == -1) {
240 			msg("rawread: lseek2 fails: %s!\n",
241 			    strerror(errno));
242 			continue;
243 		}
244 		if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize)
245 			continue;
246 		if (cnt == -1) {
247 			msg("read error from %s: %s: [sector %lld]: "
248 			    "count=%ld\n", disk, strerror(errno),
249 			    (long long)blkno, dev_bsize);
250 			continue;
251 		}
252 		msg("short read error from %s: [sector %lld]: "
253 		    "count=%ld, got=%d\n",
254 		    disk, (long long)blkno, dev_bsize, cnt);
255 	}
256 }
257 
258 void
259 bread(daddr_t blkno, char *buf, int size)
260 {
261 	int	osize = size, idx;
262 	daddr_t oblkno = blkno;
263 	char   *obuf = buf;
264 	daddr_t numBlocks = howmany(size, dev_bsize);
265 
266 #ifdef STATS
267 	nreads++;
268 	readsize += size;
269 #endif
270 
271 	if (!shareBuffer) {
272 		rawread(blkno, buf, size);
273 		return;
274 	}
275 
276 	if (flock(diskfd, LOCK_EX)) {
277 		msg("flock(LOCK_EX) failed: %s\n",
278 		    strerror(errno));
279 		rawread(blkno, buf, size);
280 		return;
281 	}
282 
283 retry:
284 	idx = 0;
285 	while (size > 0) {
286 		int	i;
287 
288 		for (i = 0; i < cachebufs; i++) {
289 			union cdesc *curr = &cdesc[(i + idx) % cachebufs];
290 
291 #ifdef DIAGNOSTICS
292 			if (curr->cd_owner) {
293 				fprintf(stderr, "Owner is set (%d, me=%d), can"
294 				    "not happen.\n", curr->cd_owner, getpid());
295 			}
296 #endif
297 
298 			if (curr->cd_blkend == 0)
299 				continue;
300 			/*
301 			 * If we find a bit of the read in the buffers,
302 			 * now compute how many blocks we can copy,
303 			 * copy them out, adjust blkno, buf and size,
304 			 * and restart
305 			 */
306 			if (curr->cd_blkstart <= blkno &&
307 			    blkno < curr->cd_blkend) {
308 				/* Number of data blocks to be copied */
309 				int toCopy = MIN(size,
310 				    (curr->cd_blkend - blkno) << dev_bshift);
311 #ifdef DIAGNOSTICS
312 				if (toCopy <= 0 || toCopy > CSIZE) {
313 					fprintf(stderr, "toCopy %d !\n",
314 					    toCopy);
315 					dumpabort(0);
316 				}
317 				if (CDATA(curr) +
318 				    ((blkno - curr->cd_blkstart) <<
319 				    dev_bshift) < CDATA(curr) ||
320 			   	    CDATA(curr) +
321 				    ((blkno - curr->cd_blkstart) <<
322 			   	    dev_bshift) > CDATA(curr) + CSIZE) {
323 					fprintf(stderr, "%p < %p !!!\n",
324 				   	   CDATA(curr) + ((blkno -
325 					   curr->cd_blkstart) << dev_bshift),
326 					   CDATA(curr));
327 					fprintf(stderr,
328 					    "cdesc[i].cd_blkstart %lld "
329 					    "blkno %lld dev_bsize %ld\n",
330 				   	    (long long)curr->cd_blkstart,
331 					    (long long)blkno,
332 					    dev_bsize);
333 					dumpabort(0);
334 				}
335 #endif
336 				memcpy(buf, CDATA(curr) +
337 				    ((blkno - curr->cd_blkstart) <<
338 				    dev_bshift),
339 			   	    toCopy);
340 
341 				buf 	+= toCopy;
342 				size 	-= toCopy;
343 				blkno 	+= howmany(toCopy, dev_bsize);
344 				numBlocks -= howmany(toCopy, dev_bsize);
345 
346 				curr->cd_time = cheader->cd_count++;
347 
348 				/*
349 				 * If all data of a cache block have been
350 				 * read, chances are good no more reads
351 				 * will occur, so expire the cache immediately
352 				 */
353 
354 				curr->cd_blocksRead +=
355 				    howmany(toCopy, dev_bsize);
356 				if (curr->cd_blocksRead >= nblksread)
357 					curr->cd_time = 0;
358 
359 				goto retry;
360 			}
361 		}
362 
363 		/* No more to do? */
364 		if (size == 0)
365 			break;
366 
367 		/*
368 		 * This does actually not happen if fs blocks are not greater
369 		 * than nblksread.
370 		 */
371 		if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) {
372 			rawread(oblkno, obuf, osize);
373 			break;
374 		} else {
375 			ssize_t	rsize;
376 			daddr_t	blockBlkNo;
377 
378 			blockBlkNo = (blkno / nblksread) * nblksread;
379 			idx = findlru();
380 			rsize = MIN(nblksread,
381 			    ufsib->ufs_dsize - blockBlkNo) << dev_bshift;
382 
383 #ifdef DIAGNOSTICS
384 			if (cdesc[idx].cd_owner)
385 				fprintf(stderr, "Owner is set (%d, me=%d), can"
386 				    "not happen(2).\n", cdesc[idx].cd_owner,
387 				    getpid());
388 			cdesc[idx].cd_owner = getpid();
389 #endif
390 			cdesc[idx].cd_time = cheader->cd_count++;
391 			cdesc[idx].cd_blkstart = blockBlkNo;
392 			cdesc[idx].cd_blkend = 0;
393 			cdesc[idx].cd_blocksRead = 0;
394 
395 			if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift),
396 			    SEEK_SET) == -1) {
397 				msg("readBlocks: lseek fails: %s\n",
398 				    strerror(errno));
399 				rsize = -1;
400 			} else {
401 				rsize = read(diskfd,
402 				    CDATA(&cdesc[idx]), rsize);
403 				if (rsize < 0) {
404 					msg("readBlocks: read fails: %s\n",
405 					    strerror(errno));
406 				}
407 			}
408 
409 			/* On errors, panic, punt, try to read without
410 			 * cache and let raw read routine do the rest.
411 			 */
412 
413 			if (rsize <= 0) {
414 				rawread(oblkno, obuf, osize);
415 #ifdef DIAGNOSTICS
416 				if (cdesc[idx].cd_owner != getpid())
417 					fprintf(stderr, "Owner changed from "
418 					    "%d to %d, can't happen\n",
419 					    getpid(), cdesc[idx].cd_owner);
420 				cdesc[idx].cd_owner = 0;
421 #endif
422 				break;
423 			}
424 
425 			/* On short read, just note the fact and go on */
426 			cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize;
427 
428 #ifdef STATS
429 			nphysread++;
430 			physreadsize += rsize;
431 #endif
432 #ifdef DIAGNOSTICS
433 			if (cdesc[idx].cd_owner != getpid())
434 				fprintf(stderr, "Owner changed from "
435 				    "%d to %d, can't happen\n",
436 				    getpid(), cdesc[idx].cd_owner);
437 			cdesc[idx].cd_owner = 0;
438 #endif
439 			/*
440 			 * We swapped some of data in, let the loop fetch
441 			 * them from cache
442 			 */
443 		}
444 	}
445 
446 	if (flock(diskfd, LOCK_UN))
447 		msg("flock(LOCK_UN) failed: %s\n",
448 		    strerror(errno));
449 }
450 
/*
 * Print this process's read statistics to stderr: number and volume of
 * reads requested through bread(), number and volume of reads issued to
 * the disk, the resulting hit rate and the extra data read.
 * Does nothing unless compiled with STATS.
 */
void
printcachestats(void)
{

#ifdef STATS
	int hits = 0;
	int overhead = 0;

	/* A process that never read anything must not divide by zero */
	if (nreads != 0)
		hits = (nreads - nphysread) * 100 / nreads;
	if (readsize != 0)
		overhead = (int) (((physreadsize - readsize) * 100) /
		    readsize);

	/* Byte counters are 64 bit wide; print them without truncation */
	fprintf(stderr, "Pid %d: %d reads (%lld bytes) "
	    "%d physical reads (%lld bytes) %d%% hits, %d%% overhead\n",
	    (int) getpid(), nreads, (long long) readsize, nphysread,
	    (long long) physreadsize, hits, overhead);
#endif
}
463