xref: /netbsd-src/sbin/dump/rcache.c (revision 9fbd88883c38d0c0fbfcbe66d76fe6b0fab3f9de)
1 /*	$NetBSD: rcache.c,v 1.10 2001/12/23 12:29:56 lukem Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Martin J. Laubach <mjl@emsi.priv.at> and
9  *    Manuel Bouyer <Manuel.Bouyer@lip6.fr>.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 /*-----------------------------------------------------------------------*/
40 #include <sys/types.h>
41 #include <sys/uio.h>
42 #include <sys/mman.h>
43 #include <sys/param.h>
44 #include <sys/sysctl.h>
45 #include <ufs/ufs/dinode.h>
46 
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <unistd.h>
50 #include <fcntl.h>
51 #include <errno.h>
52 #include <string.h>
53 
54 #include "dump.h"
55 
56 /*-----------------------------------------------------------------------*/
/* Limits applied by initcache() when it sizes the shared read cache. */
57 #define MAXCACHEBUFS	512	/* upper bound on the number of cache buffers */
58 #define MAXMEMPART	6	/* hw.usermem is divided by this: at most 1/6 (about 16%) of user memory */
59 
60 /*-----------------------------------------------------------------------*/
/*
 * Header placed at the very start of the shared cache mapping
 * (initcache() points `cheader' at the beginning of shareBuffer).
 */
61 struct cheader {
62 	volatile size_t count;	/* post-incremented by bread() to stamp cdesc.time */
63 };
64 
/*
 * Descriptor for one cache buffer.  initcache() lays out `cachebufs' of
 * these right after the header; descriptor i describes the data at CDATA(i).
 */
65 struct cdesc {
66 	volatile daddr_t blkstart;	/* first block held; bread() sets it to the nblksread-aligned chunk start */
67 	volatile daddr_t blkend;/* one past the last block actually read (blkstart + bytes read / dev_bsize); 0 = skipped by bread() */
68 	volatile daddr_t blocksRead;	/* blocks copied out since the last fill; at >= nblksread bread() zeroes `time' */
69 	volatile size_t time;	/* stamp taken from cheader->count; findlru() picks the smallest */
70 #ifdef DIAGNOSTICS
71 	volatile pid_t owner;	/* pid recorded while a fill is in progress, 0 otherwise */
72 #endif
73 };
74 
75 static int findlru(void);
76 
/*
 * Cache state.  cheader, cdesc and cdata all point into the single
 * mapping that initcache() stores in shareBuffer.
 */
77 static void *shareBuffer = NULL;	/* bread() bypasses the cache while this is NULL */
78 static struct cheader *cheader;	/* start of the mapping */
79 static struct cdesc *cdesc;	/* array of cachebufs descriptors following the header */
80 static char *cdata;	/* buffer data following the descriptors */
81 static int cachebufs;	/* number of cache buffers */
82 static int nblksread;	/* blocks per cache buffer (each buffer is nblksread * dev_bsize bytes) */
83 
/* Per-process counters, reported by printcachestats(). */
84 #ifdef STATS
85 static int nreads;	/* calls to bread() */
86 static int nphysread;	/* rawread() calls plus cache fills */
87 static int64_t readsize;	/* sum of the sizes requested from bread() */
88 static int64_t physreadsize;	/* sum of the sizes counted for physical reads */
89 #endif
90 
/* Address of the data area belonging to cache buffer i. */
91 #define CDATA(i)	(cdata + ((i) * nblksread * dev_bsize))
92 
93 void
94 initcache(int cachesize, int readblksize)
95 {
96 	size_t len;
97 	size_t  sharedSize;
98 
99 	nblksread = (readblksize + ufsib->ufs_bsize - 1) / ufsib->ufs_bsize;
100 	if(cachesize == -1) {	/* Compute from memory available */
101 		int usermem;
102 		int mib[2] = { CTL_HW, HW_USERMEM };
103 
104 		len = sizeof(usermem);
105 		if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) {
106 			msg("sysctl(hw.usermem) failed: %s\n", strerror(errno));
107 			return;
108 		}
109 		cachebufs = (usermem / MAXMEMPART) / (nblksread * dev_bsize);
110 	} else {		/* User specified */
111 		cachebufs = cachesize;
112 	}
113 
114 	if(cachebufs) {	/* Don't allocate if zero --> no caching */
115 		if (cachebufs > MAXCACHEBUFS)
116 			cachebufs = MAXCACHEBUFS;
117 
118 		sharedSize = sizeof(struct cheader) +
119 	   	    sizeof(struct cdesc) * cachebufs +
120 	   	    nblksread * cachebufs * dev_bsize;
121 #ifdef STATS
122 		fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs,
123 	   	    sharedSize);
124 #endif
125 		shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE,
126 	   	    MAP_ANON | MAP_SHARED, -1, 0);
127 		if (shareBuffer == (void *)-1) {
128 			msg("can't mmap shared memory for buffer: %s\n",
129 			    strerror(errno));
130 			return;
131 		}
132 		cheader = shareBuffer;
133 		cdesc = (struct cdesc *) (((char *) shareBuffer) +
134 		    sizeof(struct cheader));
135 		cdata = ((char *) shareBuffer) + sizeof(struct cheader) +
136 	   	    sizeof(struct cdesc) * cachebufs;
137 
138 		memset(shareBuffer, '\0', sharedSize);
139 	}
140 }
141 
142 /*
143  * Find the cache buffer descriptor that shows the minimal access time
144  */
145 static int
146 findlru(void)
147 {
148 	int	i;
149 	size_t	minTime = cdesc[0].time;
150 	int	minIdx = 0;
151 
152 	for (i = 0; i < cachebufs; i++) {
153 		if (cdesc[i].time < minTime) {
154 			minIdx = i;
155 			minTime = cdesc[i].time;
156 		}
157 	}
158 
159 	return minIdx;
160 }
161 
162 /*
163  * Read data directly from disk, with smart error handling.
164  * Try to recover from hard errors by reading in sector sized pieces.
165  * Error recovery is attempted at most BREADEMAX times before seeking
166  * consent from the operator to continue.
167  */
168 
169 static int breaderrors = 0;
170 #define BREADEMAX 32
171 
172 void
173 rawread(daddr_t blkno, char *buf, int size)
174 {
175 	int cnt, i;
176 #ifdef STATS
177 	nphysread++;
178 	physreadsize += size;
179 #endif
180 
181  loop:
182 	if (lseek(diskfd, ((off_t) blkno << dev_bshift), 0) < 0) {
183 		msg("rawread: lseek fails\n");
184 		goto err;
185 	}
186 	if ((cnt =  read(diskfd, buf, size)) == size)
187 		return;
188 	if (blkno + (size / dev_bsize) > ufsib->ufs_dsize) {
189 		/*
190 		 * Trying to read the final fragment.
191 		 *
192 		 * NB - dump only works in TP_BSIZE blocks, hence
193 		 * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
194 		 * It should be smarter about not actually trying to
195 		 * read more than it can get, but for the time being
196 		 * we punt and scale back the read only when it gets
197 		 * us into trouble. (mkm 9/25/83)
198 		 */
199 		size -= dev_bsize;
200 		goto loop;
201 	}
202 	if (cnt == -1)
203 		msg("read error from %s: %s: [block %d]: count=%d\n",
204 			disk, strerror(errno), blkno, size);
205 	else
206 		msg("short read error from %s: [block %d]: count=%d, got=%d\n",
207 			disk, blkno, size, cnt);
208 err:
209 	if (++breaderrors > BREADEMAX) {
210 		msg("More than %d block read errors from %s\n",
211 			BREADEMAX, disk);
212 		broadcast("DUMP IS AILING!\n");
213 		msg("This is an unrecoverable error.\n");
214 		if (!query("Do you want to attempt to continue?")){
215 			dumpabort(0);
216 			/*NOTREACHED*/
217 		} else
218 			breaderrors = 0;
219 	}
220 	/*
221 	 * Zero buffer, then try to read each sector of buffer separately.
222 	 */
223 	memset(buf, 0, size);
224 	for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) {
225 		if (lseek(diskfd, ((off_t)blkno << dev_bshift), 0) < 0) {
226 			msg("rawread: lseek2 fails: %s!\n",
227 			    strerror(errno));
228 			continue;
229 		}
230 		if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize)
231 			continue;
232 		if (cnt == -1) {
233 			msg("read error from %s: %s: [sector %d]: count=%ld: "
234 			    "%s\n", disk, strerror(errno), blkno, dev_bsize,
235 			    strerror(errno));
236 			continue;
237 		}
238 		msg("short read error from %s: [sector %d]: count=%ld, got=%d\n",
239 		    disk, blkno, dev_bsize, cnt);
240 	}
241 }
242 
243 void
244 bread(daddr_t blkno, char *buf, int size)
245 {
246 	int	osize = size;
247 	daddr_t oblkno = blkno;
248 	char   *obuf = buf;
249 	daddr_t numBlocks = (size + dev_bsize -1) / dev_bsize;
250 
251 #ifdef STATS
252 	nreads++;
253 	readsize += size;
254 #endif
255 
256 	if (!shareBuffer) {
257 		rawread(blkno, buf, size);
258 		return;
259 	}
260 
261 	if (flock(diskfd, LOCK_EX)) {
262 		msg("flock(LOCK_EX) failed: %s\n",
263 		    strerror(errno));
264 		rawread(blkno, buf, size);
265 		return;
266 	}
267 
268 retry:
269 	while(size > 0) {
270 		int	i;
271 
272 		for (i = 0; i < cachebufs; i++) {
273 			struct cdesc *curr = &cdesc[i];
274 
275 #ifdef DIAGNOSTICS
276 			if (curr->owner) {
277 				fprintf(stderr, "Owner is set (%d, me=%d), can"
278 				    "not happen.\n", curr->owner, getpid());
279 			}
280 #endif
281 
282 			if (curr->blkend == 0)
283 				continue;
284 			/*
285 			 * If we find a bit of the read in the buffers,
286 			 * now compute how many blocks we can copy,
287 			 * copy them out, adjust blkno, buf and size,
288 			 * and restart
289 			 */
290 			if (curr->blkstart <= blkno &&
291 			    blkno < curr->blkend) {
292 				/* Number of data blocks to be copied */
293 				int toCopy = MIN(size,
294 				    (curr->blkend - blkno) * dev_bsize);
295 #ifdef DIAGNOSTICS
296 				if (toCopy <= 0 ||
297 				    toCopy > nblksread * dev_bsize) {
298 					fprintf(stderr, "toCopy %d !\n",
299 					    toCopy);
300 					dumpabort(0);
301 				}
302 				if (CDATA(i) + (blkno - curr->blkstart) *
303 			   	    dev_bsize < CDATA(i) ||
304 			   	    CDATA(i) + (blkno - curr->blkstart) *
305 			   	    dev_bsize >
306 				    CDATA(i) + nblksread * dev_bsize) {
307 					fprintf(stderr, "%p < %p !!!\n",
308 				   	   CDATA(i) + (blkno -
309 						curr->blkstart) * dev_bsize,
310 					   CDATA(i));
311 					fprintf(stderr, "cdesc[i].blkstart %d "
312 					    "blkno %d dev_bsize %ld\n",
313 				   	    curr->blkstart, blkno, dev_bsize);
314 					dumpabort(0);
315 				}
316 #endif
317 				memcpy(buf, CDATA(i) +
318 				    (blkno - curr->blkstart) * dev_bsize,
319 			   	    toCopy);
320 
321 				buf 	+= toCopy;
322 				size 	-= toCopy;
323 				blkno 	+= (toCopy + dev_bsize - 1) / dev_bsize;
324 				numBlocks -=
325 				    (toCopy  + dev_bsize - 1) / dev_bsize;
326 
327 				curr->time = cheader->count++;
328 
329 				/*
330 				 * If all data of a cache block have been
331 				 * read, chances are good no more reads
332 				 * will occur, so expire the cache immediately
333 				 */
334 
335 				curr->blocksRead +=
336 				    (toCopy + dev_bsize -1) / dev_bsize;
337 				if (curr->blocksRead >= nblksread)
338 					curr->time = 0;
339 
340 				goto retry;
341 			}
342 		}
343 
344 		/* No more to do? */
345 		if (size == 0)
346 			break;
347 
348 		/*
349 		 * This does actually not happen if fs blocks are not greater
350 		 * than nblksread.
351 		 */
352 		if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) {
353 			rawread(oblkno, obuf, osize);
354 			break;
355 		} else {
356 			int	idx;
357 			ssize_t	rsize;
358 			daddr_t	blockBlkNo;
359 
360 			blockBlkNo = (blkno / nblksread) * nblksread;
361 			idx = findlru();
362 			rsize = MIN(nblksread,
363 			    ufsib->ufs_dsize - blockBlkNo) *
364 			    dev_bsize;
365 
366 #ifdef DIAGNOSTICS
367 			if (cdesc[idx].owner)
368 				fprintf(stderr, "Owner is set (%d, me=%d), can"
369 				    "not happen(2).\n", cdesc[idx].owner,
370 				    getpid());
371 			cdesc[idx].owner = getpid();
372 #endif
373 			cdesc[idx].time = cheader->count++;
374 			cdesc[idx].blkstart = blockBlkNo;
375 			cdesc[idx].blocksRead = 0;
376 
377 			if (lseek(diskfd,
378 			    ((off_t) (blockBlkNo) << dev_bshift), 0) < 0) {
379 				msg("readBlocks: lseek fails: %s\n",
380 				    strerror(errno));
381 				rsize = -1;
382 			} else {
383 				rsize = read(diskfd, CDATA(idx), rsize);
384 				if (rsize < 0) {
385 					msg("readBlocks: read fails: %s\n",
386 					    strerror(errno));
387 				}
388 			}
389 
390 			/* On errors, panic, punt, try to read without
391 			 * cache and let raw read routine do the rest.
392 			 */
393 
394 			if (rsize <= 0) {
395 				rawread(oblkno, obuf, osize);
396 #ifdef DIAGNOSTICS
397 				if (cdesc[idx].owner != getpid())
398 					fprintf(stderr, "Owner changed from "
399 					    "%d to %d, can't happen\n",
400 					    getpid(), cdesc[idx].owner);
401 				cdesc[idx].owner = 0;
402 #endif
403 				break;
404 			}
405 
406 			/* On short read, just note the fact and go on */
407 			cdesc[idx].blkend = blockBlkNo + rsize / dev_bsize;
408 
409 #ifdef STATS
410 			nphysread++;
411 			physreadsize += rsize;
412 #endif
413 #ifdef DIAGNOSTICS
414 			if (cdesc[idx].owner != getpid())
415 				fprintf(stderr, "Owner changed from "
416 				    "%d to %d, can't happen\n",
417 				    getpid(), cdesc[idx].owner);
418 			cdesc[idx].owner = 0;
419 #endif
420 			/*
421 			 * We swapped some of data in, let the loop fetch
422 			 * them from cache
423 			 */
424 		}
425 	}
426 
427 	if (flock(diskfd, LOCK_UN))
428 		msg("flock(LOCK_UN) failed: %s\n",
429 		    strerror(errno));
430 	return;
431 }
432 
/*
 * Print this process's cache statistics on stderr.  Does nothing unless
 * the file is built with STATS defined.
 *
 * Both percentages are reported as 0 when their divisor (the number of
 * reads, or the number of bytes requested) is zero.
 */
void
printcachestats(void)
{
#ifdef STATS
	int hitPercent = 0;
	int overheadPercent = 0;

	/* A process that issued no reads must not divide by zero. */
	if (nreads != 0)
		hitPercent = (nreads - nphysread) * 100 / nreads;
	if (readsize != 0)
		overheadPercent =
		    (int) (((physreadsize - readsize) * 100) / readsize);

	/* The byte counters are 64-bit; print them without truncation. */
	fprintf(stderr, "Pid %d: %d reads (%lld bytes) "
	    "%d physical reads (%lld bytes) %d%% hits, %d%% overhead\n",
	    (int) getpid(), nreads, (long long) readsize, nphysread,
	    (long long) physreadsize, hitPercent, overheadPercent);
#endif
}
444