1 /* $NetBSD: rcache.c,v 1.10 2001/12/23 12:29:56 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 /*-----------------------------------------------------------------------*/ 40 #include <sys/types.h> 41 #include <sys/uio.h> 42 #include <sys/mman.h> 43 #include <sys/param.h> 44 #include <sys/sysctl.h> 45 #include <ufs/ufs/dinode.h> 46 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <unistd.h> 50 #include <fcntl.h> 51 #include <errno.h> 52 #include <string.h> 53 54 #include "dump.h" 55 56 /*-----------------------------------------------------------------------*/ 57 #define MAXCACHEBUFS 512 /* max 512 buffers */ 58 #define MAXMEMPART 6 /* max 15% of the user mem */ 59 60 /*-----------------------------------------------------------------------*/ 61 struct cheader { 62 volatile size_t count; 63 }; 64 65 struct cdesc { 66 volatile daddr_t blkstart; 67 volatile daddr_t blkend;/* start + nblksread */ 68 volatile daddr_t blocksRead; 69 volatile size_t time; 70 #ifdef DIAGNOSTICS 71 volatile pid_t owner; 72 #endif 73 }; 74 75 static int findlru(void); 76 77 static void *shareBuffer = NULL; 78 static struct cheader *cheader; 79 static struct cdesc *cdesc; 80 static char *cdata; 81 static int cachebufs; 82 static int nblksread; 83 84 #ifdef STATS 85 static int nreads; 86 static int nphysread; 87 static int64_t readsize; 88 static int64_t physreadsize; 89 #endif 90 91 #define CDATA(i) (cdata + ((i) * nblksread * dev_bsize)) 92 93 void 94 initcache(int cachesize, int readblksize) 95 { 96 size_t len; 97 size_t sharedSize; 98 99 nblksread = (readblksize + ufsib->ufs_bsize - 1) / ufsib->ufs_bsize; 100 if(cachesize == -1) { /* Compute from memory available */ 101 int usermem; 102 int mib[2] = { CTL_HW, HW_USERMEM }; 103 104 len = sizeof(usermem); 105 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 106 msg("sysctl(hw.usermem) failed: %s\n", strerror(errno)); 107 return; 108 } 109 cachebufs = (usermem / MAXMEMPART) / (nblksread * dev_bsize); 110 } else { /* User specified */ 111 cachebufs = cachesize; 112 } 113 114 if(cachebufs) { /* Don't allocate if zero --> no caching */ 115 if (cachebufs > MAXCACHEBUFS) 116 cachebufs = MAXCACHEBUFS; 117 118 sharedSize = sizeof(struct cheader) + 119 sizeof(struct cdesc) * cachebufs + 120 nblksread * cachebufs * dev_bsize; 121 #ifdef STATS 122 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 123 sharedSize); 124 #endif 125 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 126 MAP_ANON | MAP_SHARED, -1, 0); 127 if (shareBuffer == (void *)-1) { 128 msg("can't mmap shared memory for buffer: %s\n", 129 strerror(errno)); 130 return; 131 } 132 cheader = shareBuffer; 133 cdesc = (struct cdesc *) (((char *) shareBuffer) + 134 sizeof(struct cheader)); 135 cdata = ((char *) shareBuffer) + sizeof(struct cheader) + 136 sizeof(struct cdesc) * cachebufs; 137 138 memset(shareBuffer, '\0', sharedSize); 139 } 140 } 141 142 /* 143 * Find the cache buffer descriptor that shows the minimal access time 144 */ 145 static int 146 findlru(void) 147 { 148 int i; 149 size_t minTime = cdesc[0].time; 150 int minIdx = 0; 151 152 for (i = 0; i < cachebufs; i++) { 153 if (cdesc[i].time < minTime) { 154 minIdx = i; 155 minTime = cdesc[i].time; 156 } 157 } 158 159 return minIdx; 160 } 161 162 /* 163 * Read data directly from disk, with smart error handling. 164 * Try to recover from hard errors by reading in sector sized pieces. 165 * Error recovery is attempted at most BREADEMAX times before seeking 166 * consent from the operator to continue. 167 */ 168 169 static int breaderrors = 0; 170 #define BREADEMAX 32 171 172 void 173 rawread(daddr_t blkno, char *buf, int size) 174 { 175 int cnt, i; 176 #ifdef STATS 177 nphysread++; 178 physreadsize += size; 179 #endif 180 181 loop: 182 if (lseek(diskfd, ((off_t) blkno << dev_bshift), 0) < 0) { 183 msg("rawread: lseek fails\n"); 184 goto err; 185 } 186 if ((cnt = read(diskfd, buf, size)) == size) 187 return; 188 if (blkno + (size / dev_bsize) > ufsib->ufs_dsize) { 189 /* 190 * Trying to read the final fragment. 191 * 192 * NB - dump only works in TP_BSIZE blocks, hence 193 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 194 * It should be smarter about not actually trying to 195 * read more than it can get, but for the time being 196 * we punt and scale back the read only when it gets 197 * us into trouble. (mkm 9/25/83) 198 */ 199 size -= dev_bsize; 200 goto loop; 201 } 202 if (cnt == -1) 203 msg("read error from %s: %s: [block %d]: count=%d\n", 204 disk, strerror(errno), blkno, size); 205 else 206 msg("short read error from %s: [block %d]: count=%d, got=%d\n", 207 disk, blkno, size, cnt); 208 err: 209 if (++breaderrors > BREADEMAX) { 210 msg("More than %d block read errors from %s\n", 211 BREADEMAX, disk); 212 broadcast("DUMP IS AILING!\n"); 213 msg("This is an unrecoverable error.\n"); 214 if (!query("Do you want to attempt to continue?")){ 215 dumpabort(0); 216 /*NOTREACHED*/ 217 } else 218 breaderrors = 0; 219 } 220 /* 221 * Zero buffer, then try to read each sector of buffer separately. 222 */ 223 memset(buf, 0, size); 224 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 225 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 0) < 0) { 226 msg("rawread: lseek2 fails: %s!\n", 227 strerror(errno)); 228 continue; 229 } 230 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 231 continue; 232 if (cnt == -1) { 233 msg("read error from %s: %s: [sector %d]: count=%ld: " 234 "%s\n", disk, strerror(errno), blkno, dev_bsize, 235 strerror(errno)); 236 continue; 237 } 238 msg("short read error from %s: [sector %d]: count=%ld, got=%d\n", 239 disk, blkno, dev_bsize, cnt); 240 } 241 } 242 243 void 244 bread(daddr_t blkno, char *buf, int size) 245 { 246 int osize = size; 247 daddr_t oblkno = blkno; 248 char *obuf = buf; 249 daddr_t numBlocks = (size + dev_bsize -1) / dev_bsize; 250 251 #ifdef STATS 252 nreads++; 253 readsize += size; 254 #endif 255 256 if (!shareBuffer) { 257 rawread(blkno, buf, size); 258 return; 259 } 260 261 if (flock(diskfd, LOCK_EX)) { 262 msg("flock(LOCK_EX) failed: %s\n", 263 strerror(errno)); 264 rawread(blkno, buf, size); 265 return; 266 } 267 268 retry: 269 while(size > 0) { 270 int i; 271 272 for (i = 0; i < cachebufs; i++) { 273 struct cdesc *curr = &cdesc[i]; 274 275 #ifdef DIAGNOSTICS 276 if (curr->owner) { 277 fprintf(stderr, "Owner is set (%d, me=%d), can" 278 "not happen.\n", curr->owner, getpid()); 279 } 280 #endif 281 282 if (curr->blkend == 0) 283 continue; 284 /* 285 * If we find a bit of the read in the buffers, 286 * now compute how many blocks we can copy, 287 * copy them out, adjust blkno, buf and size, 288 * and restart 289 */ 290 if (curr->blkstart <= blkno && 291 blkno < curr->blkend) { 292 /* Number of data blocks to be copied */ 293 int toCopy = MIN(size, 294 (curr->blkend - blkno) * dev_bsize); 295 #ifdef DIAGNOSTICS 296 if (toCopy <= 0 || 297 toCopy > nblksread * dev_bsize) { 298 fprintf(stderr, "toCopy %d !\n", 299 toCopy); 300 dumpabort(0); 301 } 302 if (CDATA(i) + (blkno - curr->blkstart) * 303 dev_bsize < CDATA(i) || 304 CDATA(i) + (blkno - curr->blkstart) * 305 dev_bsize > 306 CDATA(i) + nblksread * dev_bsize) { 307 fprintf(stderr, "%p < %p !!!\n", 308 CDATA(i) + (blkno - 309 curr->blkstart) * dev_bsize, 310 CDATA(i)); 311 fprintf(stderr, "cdesc[i].blkstart %d " 312 "blkno %d dev_bsize %ld\n", 313 curr->blkstart, blkno, dev_bsize); 314 dumpabort(0); 315 } 316 #endif 317 memcpy(buf, CDATA(i) + 318 (blkno - curr->blkstart) * dev_bsize, 319 toCopy); 320 321 buf += toCopy; 322 size -= toCopy; 323 blkno += (toCopy + dev_bsize - 1) / dev_bsize; 324 numBlocks -= 325 (toCopy + dev_bsize - 1) / dev_bsize; 326 327 curr->time = cheader->count++; 328 329 /* 330 * If all data of a cache block have been 331 * read, chances are good no more reads 332 * will occur, so expire the cache immediately 333 */ 334 335 curr->blocksRead += 336 (toCopy + dev_bsize -1) / dev_bsize; 337 if (curr->blocksRead >= nblksread) 338 curr->time = 0; 339 340 goto retry; 341 } 342 } 343 344 /* No more to do? */ 345 if (size == 0) 346 break; 347 348 /* 349 * This does actually not happen if fs blocks are not greater 350 * than nblksread. 351 */ 352 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 353 rawread(oblkno, obuf, osize); 354 break; 355 } else { 356 int idx; 357 ssize_t rsize; 358 daddr_t blockBlkNo; 359 360 blockBlkNo = (blkno / nblksread) * nblksread; 361 idx = findlru(); 362 rsize = MIN(nblksread, 363 ufsib->ufs_dsize - blockBlkNo) * 364 dev_bsize; 365 366 #ifdef DIAGNOSTICS 367 if (cdesc[idx].owner) 368 fprintf(stderr, "Owner is set (%d, me=%d), can" 369 "not happen(2).\n", cdesc[idx].owner, 370 getpid()); 371 cdesc[idx].owner = getpid(); 372 #endif 373 cdesc[idx].time = cheader->count++; 374 cdesc[idx].blkstart = blockBlkNo; 375 cdesc[idx].blocksRead = 0; 376 377 if (lseek(diskfd, 378 ((off_t) (blockBlkNo) << dev_bshift), 0) < 0) { 379 msg("readBlocks: lseek fails: %s\n", 380 strerror(errno)); 381 rsize = -1; 382 } else { 383 rsize = read(diskfd, CDATA(idx), rsize); 384 if (rsize < 0) { 385 msg("readBlocks: read fails: %s\n", 386 strerror(errno)); 387 } 388 } 389 390 /* On errors, panic, punt, try to read without 391 * cache and let raw read routine do the rest. 392 */ 393 394 if (rsize <= 0) { 395 rawread(oblkno, obuf, osize); 396 #ifdef DIAGNOSTICS 397 if (cdesc[idx].owner != getpid()) 398 fprintf(stderr, "Owner changed from " 399 "%d to %d, can't happen\n", 400 getpid(), cdesc[idx].owner); 401 cdesc[idx].owner = 0; 402 #endif 403 break; 404 } 405 406 /* On short read, just note the fact and go on */ 407 cdesc[idx].blkend = blockBlkNo + rsize / dev_bsize; 408 409 #ifdef STATS 410 nphysread++; 411 physreadsize += rsize; 412 #endif 413 #ifdef DIAGNOSTICS 414 if (cdesc[idx].owner != getpid()) 415 fprintf(stderr, "Owner changed from " 416 "%d to %d, can't happen\n", 417 getpid(), cdesc[idx].owner); 418 cdesc[idx].owner = 0; 419 #endif 420 /* 421 * We swapped some of data in, let the loop fetch 422 * them from cache 423 */ 424 } 425 } 426 427 if (flock(diskfd, LOCK_UN)) 428 msg("flock(LOCK_UN) failed: %s\n", 429 strerror(errno)); 430 return; 431 } 432 433 void 434 printcachestats(void) 435 { 436 #ifdef STATS 437 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 438 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 439 getpid(), nreads, (u_int) readsize, nphysread, 440 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 441 (int) (((physreadsize - readsize) * 100) / readsize)); 442 #endif 443 } 444