1 /* $NetBSD: rcache.c,v 1.23 2010/01/27 12:20:25 spz Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 #ifndef lint 35 __RCSID("$NetBSD: rcache.c,v 1.23 2010/01/27 12:20:25 spz Exp $"); 36 #endif /* not lint */ 37 38 #include <sys/types.h> 39 #include <sys/uio.h> 40 #include <sys/mman.h> 41 #include <sys/param.h> 42 #include <sys/sysctl.h> 43 #include <ufs/ufs/dinode.h> 44 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <unistd.h> 48 #include <fcntl.h> 49 #include <errno.h> 50 #include <string.h> 51 52 #include "dump.h" 53 54 /*-----------------------------------------------------------------------*/ 55 #define MAXCACHEBUFS 512 /* max 512 buffers */ 56 #define MAXMEMPART 6 /* max 15% of the user mem */ 57 58 /*-----------------------------------------------------------------------*/ 59 union cdesc { 60 volatile size_t cd_count; 61 struct { 62 volatile daddr_t blkstart; 63 volatile daddr_t blkend; /* start + nblksread */ 64 volatile daddr_t blocksRead; 65 volatile size_t time; 66 #ifdef DIAGNOSTICS 67 volatile pid_t owner; 68 #endif 69 } desc; 70 #define cd_blkstart desc.blkstart 71 #define cd_blkend desc.blkend 72 #define cd_blocksRead desc.blocksRead 73 #define cd_time desc.time 74 #define cd_owner desc.owner 75 }; 76 77 static int findlru(void); 78 79 static void *shareBuffer = NULL; 80 static union cdesc *cheader; 81 static union cdesc *cdesc; 82 static char *cdata; 83 static int cachebufs; 84 static int nblksread; 85 86 #ifdef STATS 87 static int nreads; 88 static int nphysread; 89 static int64_t readsize; 90 static int64_t physreadsize; 91 #endif 92 93 #define CSIZE (nblksread << dev_bshift) /* cache buf size */ 94 #define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE) 95 96 void 97 initcache(int cachesize, int readblksize) 98 { 99 size_t len; 100 size_t sharedSize; 101 102 /* Convert read block size in terms of filesystem block size */ 103 nblksread = howmany(readblksize, ufsib->ufs_bsize); 104 105 /* Then, convert it in terms of device block size */ 106 nblksread <<= ufsib->ufs_bshift - dev_bshift; 107 108 if (cachesize == -1) { /* Compute from memory available */ 109 uint64_t usermem, cachetmp; 110 int mib[2] = { CTL_HW, HW_USERMEM64 }; 111 112 len = sizeof(usermem); 113 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 114 msg("sysctl(hw.usermem) failed: %s\n", 115 strerror(errno)); 116 return; 117 } 118 cachetmp = (usermem / MAXMEMPART) / CSIZE; 119 /* for those with TB of RAM */ 120 cachebufs = (cachetmp > INT_MAX) ? INT_MAX : cachetmp; 121 } else { /* User specified */ 122 cachebufs = cachesize; 123 } 124 125 if (cachebufs) { /* Don't allocate if zero --> no caching */ 126 if (cachebufs > MAXCACHEBUFS) 127 cachebufs = MAXCACHEBUFS; 128 129 sharedSize = sizeof(union cdesc) + 130 sizeof(union cdesc) * cachebufs + 131 cachebufs * CSIZE; 132 #ifdef STATS 133 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 134 sharedSize); 135 #endif 136 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 137 MAP_ANON | MAP_SHARED, -1, 0); 138 if (shareBuffer == MAP_FAILED) { 139 msg("can't mmap shared memory for buffer: %s\n", 140 strerror(errno)); 141 return; 142 } 143 cheader = shareBuffer; 144 cdesc = (union cdesc *) (((char *) shareBuffer) + 145 sizeof(union cdesc)); 146 cdata = ((char *) shareBuffer) + sizeof(union cdesc) + 147 sizeof(union cdesc) * cachebufs; 148 149 memset(shareBuffer, '\0', sharedSize); 150 } 151 } 152 153 /* 154 * Find the cache buffer descriptor that shows the minimal access time 155 */ 156 static int 157 findlru(void) 158 { 159 int i; 160 size_t minTime = cdesc[0].cd_time; 161 int minIdx = 0; 162 163 for (i = 0; i < cachebufs; i++) { 164 if (cdesc[i].cd_time < minTime) { 165 minIdx = i; 166 minTime = cdesc[i].cd_time; 167 } 168 } 169 170 return minIdx; 171 } 172 173 /* 174 * Read data directly from disk, with smart error handling. 175 * Try to recover from hard errors by reading in sector sized pieces. 176 * Error recovery is attempted at most BREADEMAX times before seeking 177 * consent from the operator to continue. 178 */ 179 180 static int breaderrors = 0; 181 #define BREADEMAX 32 182 183 void 184 rawread(daddr_t blkno, char *buf, int size) 185 { 186 int cnt, i; 187 188 #ifdef STATS 189 nphysread++; 190 physreadsize += size; 191 #endif 192 193 loop: 194 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) { 195 msg("rawread: lseek fails\n"); 196 goto err; 197 } 198 if ((cnt = read(diskfd, buf, size)) == size) 199 return; 200 if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) { 201 /* 202 * Trying to read the final fragment. 203 * 204 * NB - dump only works in TP_BSIZE blocks, hence 205 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 206 * It should be smarter about not actually trying to 207 * read more than it can get, but for the time being 208 * we punt and scale back the read only when it gets 209 * us into trouble. (mkm 9/25/83) 210 */ 211 size -= dev_bsize; 212 goto loop; 213 } 214 if (cnt == -1) 215 msg("read error from %s: %s: [block %lld]: count=%d\n", 216 disk, strerror(errno), (long long)blkno, size); 217 else 218 msg("short read error from %s: [block %lld]: " 219 "count=%d, got=%d\n", 220 disk, (long long)blkno, size, cnt); 221 err: 222 if (++breaderrors > BREADEMAX) { 223 msg("More than %d block read errors from %s\n", 224 BREADEMAX, disk); 225 broadcast("DUMP IS AILING!\n"); 226 msg("This is an unrecoverable error.\n"); 227 if (!query("Do you want to attempt to continue?")) { 228 dumpabort(0); 229 /*NOTREACHED*/ 230 } else 231 breaderrors = 0; 232 } 233 /* 234 * Zero buffer, then try to read each sector of buffer separately. 235 */ 236 memset(buf, 0, size); 237 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 238 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 239 SEEK_SET) == -1) { 240 msg("rawread: lseek2 fails: %s!\n", 241 strerror(errno)); 242 continue; 243 } 244 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 245 continue; 246 if (cnt == -1) { 247 msg("read error from %s: %s: [sector %lld]: " 248 "count=%ld\n", disk, strerror(errno), 249 (long long)blkno, dev_bsize); 250 continue; 251 } 252 msg("short read error from %s: [sector %lld]: " 253 "count=%ld, got=%d\n", 254 disk, (long long)blkno, dev_bsize, cnt); 255 } 256 } 257 258 void 259 bread(daddr_t blkno, char *buf, int size) 260 { 261 int osize = size, idx; 262 daddr_t oblkno = blkno; 263 char *obuf = buf; 264 daddr_t numBlocks = howmany(size, dev_bsize); 265 266 #ifdef STATS 267 nreads++; 268 readsize += size; 269 #endif 270 271 if (!shareBuffer) { 272 rawread(blkno, buf, size); 273 return; 274 } 275 276 if (flock(diskfd, LOCK_EX)) { 277 msg("flock(LOCK_EX) failed: %s\n", 278 strerror(errno)); 279 rawread(blkno, buf, size); 280 return; 281 } 282 283 retry: 284 idx = 0; 285 while (size > 0) { 286 int i; 287 288 for (i = 0; i < cachebufs; i++) { 289 union cdesc *curr = &cdesc[(i + idx) % cachebufs]; 290 291 #ifdef DIAGNOSTICS 292 if (curr->cd_owner) { 293 fprintf(stderr, "Owner is set (%d, me=%d), can" 294 "not happen.\n", curr->cd_owner, getpid()); 295 } 296 #endif 297 298 if (curr->cd_blkend == 0) 299 continue; 300 /* 301 * If we find a bit of the read in the buffers, 302 * now compute how many blocks we can copy, 303 * copy them out, adjust blkno, buf and size, 304 * and restart 305 */ 306 if (curr->cd_blkstart <= blkno && 307 blkno < curr->cd_blkend) { 308 /* Number of data blocks to be copied */ 309 int toCopy = MIN(size, 310 (curr->cd_blkend - blkno) << dev_bshift); 311 #ifdef DIAGNOSTICS 312 if (toCopy <= 0 || toCopy > CSIZE) { 313 fprintf(stderr, "toCopy %d !\n", 314 toCopy); 315 dumpabort(0); 316 } 317 if (CDATA(curr) + 318 ((blkno - curr->cd_blkstart) << 319 dev_bshift) < CDATA(curr) || 320 CDATA(curr) + 321 ((blkno - curr->cd_blkstart) << 322 dev_bshift) > CDATA(curr) + CSIZE) { 323 fprintf(stderr, "%p < %p !!!\n", 324 CDATA(curr) + ((blkno - 325 curr->cd_blkstart) << dev_bshift), 326 CDATA(curr)); 327 fprintf(stderr, 328 "cdesc[i].cd_blkstart %lld " 329 "blkno %lld dev_bsize %ld\n", 330 (long long)curr->cd_blkstart, 331 (long long)blkno, 332 dev_bsize); 333 dumpabort(0); 334 } 335 #endif 336 memcpy(buf, CDATA(curr) + 337 ((blkno - curr->cd_blkstart) << 338 dev_bshift), 339 toCopy); 340 341 buf += toCopy; 342 size -= toCopy; 343 blkno += howmany(toCopy, dev_bsize); 344 numBlocks -= howmany(toCopy, dev_bsize); 345 346 curr->cd_time = cheader->cd_count++; 347 348 /* 349 * If all data of a cache block have been 350 * read, chances are good no more reads 351 * will occur, so expire the cache immediately 352 */ 353 354 curr->cd_blocksRead += 355 howmany(toCopy, dev_bsize); 356 if (curr->cd_blocksRead >= nblksread) 357 curr->cd_time = 0; 358 359 goto retry; 360 } 361 } 362 363 /* No more to do? */ 364 if (size == 0) 365 break; 366 367 /* 368 * This does actually not happen if fs blocks are not greater 369 * than nblksread. 370 */ 371 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 372 rawread(oblkno, obuf, osize); 373 break; 374 } else { 375 ssize_t rsize; 376 daddr_t blockBlkNo; 377 378 blockBlkNo = (blkno / nblksread) * nblksread; 379 idx = findlru(); 380 rsize = MIN(nblksread, 381 ufsib->ufs_dsize - blockBlkNo) << dev_bshift; 382 383 #ifdef DIAGNOSTICS 384 if (cdesc[idx].cd_owner) 385 fprintf(stderr, "Owner is set (%d, me=%d), can" 386 "not happen(2).\n", cdesc[idx].cd_owner, 387 getpid()); 388 cdesc[idx].cd_owner = getpid(); 389 #endif 390 cdesc[idx].cd_time = cheader->cd_count++; 391 cdesc[idx].cd_blkstart = blockBlkNo; 392 cdesc[idx].cd_blkend = 0; 393 cdesc[idx].cd_blocksRead = 0; 394 395 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift), 396 SEEK_SET) == -1) { 397 msg("readBlocks: lseek fails: %s\n", 398 strerror(errno)); 399 rsize = -1; 400 } else { 401 rsize = read(diskfd, 402 CDATA(&cdesc[idx]), rsize); 403 if (rsize < 0) { 404 msg("readBlocks: read fails: %s\n", 405 strerror(errno)); 406 } 407 } 408 409 /* On errors, panic, punt, try to read without 410 * cache and let raw read routine do the rest. 411 */ 412 413 if (rsize <= 0) { 414 rawread(oblkno, obuf, osize); 415 #ifdef DIAGNOSTICS 416 if (cdesc[idx].cd_owner != getpid()) 417 fprintf(stderr, "Owner changed from " 418 "%d to %d, can't happen\n", 419 getpid(), cdesc[idx].cd_owner); 420 cdesc[idx].cd_owner = 0; 421 #endif 422 break; 423 } 424 425 /* On short read, just note the fact and go on */ 426 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize; 427 428 #ifdef STATS 429 nphysread++; 430 physreadsize += rsize; 431 #endif 432 #ifdef DIAGNOSTICS 433 if (cdesc[idx].cd_owner != getpid()) 434 fprintf(stderr, "Owner changed from " 435 "%d to %d, can't happen\n", 436 getpid(), cdesc[idx].cd_owner); 437 cdesc[idx].cd_owner = 0; 438 #endif 439 /* 440 * We swapped some of data in, let the loop fetch 441 * them from cache 442 */ 443 } 444 } 445 446 if (flock(diskfd, LOCK_UN)) 447 msg("flock(LOCK_UN) failed: %s\n", 448 strerror(errno)); 449 } 450 451 void 452 printcachestats(void) 453 { 454 455 #ifdef STATS 456 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 457 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 458 getpid(), nreads, (u_int) readsize, nphysread, 459 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 460 (int) (((physreadsize - readsize) * 100) / readsize)); 461 #endif 462 } 463