1 /* $NetBSD: rcache.c,v 1.24 2013/06/15 01:27:19 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 #ifndef lint 35 __RCSID("$NetBSD: rcache.c,v 1.24 2013/06/15 01:27:19 christos Exp $"); 36 #endif /* not lint */ 37 38 #include <sys/types.h> 39 #include <sys/uio.h> 40 #include <sys/mman.h> 41 #include <sys/param.h> 42 #include <sys/sysctl.h> 43 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <unistd.h> 47 #include <fcntl.h> 48 #include <errno.h> 49 #include <string.h> 50 51 #include "dump.h" 52 53 /*-----------------------------------------------------------------------*/ 54 #define MAXCACHEBUFS 512 /* max 512 buffers */ 55 #define MAXMEMPART 6 /* max 15% of the user mem */ 56 57 /*-----------------------------------------------------------------------*/ 58 union cdesc { 59 volatile size_t cd_count; 60 struct { 61 volatile daddr_t blkstart; 62 volatile daddr_t blkend; /* start + nblksread */ 63 volatile daddr_t blocksRead; 64 volatile size_t time; 65 #ifdef DIAGNOSTICS 66 volatile pid_t owner; 67 #endif 68 } desc; 69 #define cd_blkstart desc.blkstart 70 #define cd_blkend desc.blkend 71 #define cd_blocksRead desc.blocksRead 72 #define cd_time desc.time 73 #define cd_owner desc.owner 74 }; 75 76 static int findlru(void); 77 78 static void *shareBuffer = NULL; 79 static union cdesc *cheader; 80 static union cdesc *cdesc; 81 static char *cdata; 82 static int cachebufs; 83 static int nblksread; 84 85 #ifdef STATS 86 static int nreads; 87 static int nphysread; 88 static int64_t readsize; 89 static int64_t physreadsize; 90 #endif 91 92 #define CSIZE (nblksread << dev_bshift) /* cache buf size */ 93 #define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE) 94 95 void 96 initcache(int cachesize, int readblksize) 97 { 98 size_t len; 99 size_t sharedSize; 100 101 /* Convert read block size in terms of filesystem block size */ 102 nblksread = howmany(readblksize, ufsib->ufs_bsize); 103 104 /* Then, convert it in terms of device block size */ 105 nblksread <<= ufsib->ufs_bshift - dev_bshift; 106 107 if (cachesize == -1) { /* Compute from memory available */ 108 uint64_t usermem, cachetmp; 109 int mib[2] = { CTL_HW, HW_USERMEM64 }; 110 111 len = sizeof(usermem); 112 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 113 msg("sysctl(hw.usermem) failed: %s\n", 114 strerror(errno)); 115 return; 116 } 117 cachetmp = (usermem / MAXMEMPART) / CSIZE; 118 /* for those with TB of RAM */ 119 cachebufs = (cachetmp > INT_MAX) ? INT_MAX : cachetmp; 120 } else { /* User specified */ 121 cachebufs = cachesize; 122 } 123 124 if (cachebufs) { /* Don't allocate if zero --> no caching */ 125 if (cachebufs > MAXCACHEBUFS) 126 cachebufs = MAXCACHEBUFS; 127 128 sharedSize = sizeof(union cdesc) + 129 sizeof(union cdesc) * cachebufs + 130 cachebufs * CSIZE; 131 #ifdef STATS 132 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 133 sharedSize); 134 #endif 135 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 136 MAP_ANON | MAP_SHARED, -1, 0); 137 if (shareBuffer == MAP_FAILED) { 138 msg("can't mmap shared memory for buffer: %s\n", 139 strerror(errno)); 140 return; 141 } 142 cheader = shareBuffer; 143 cdesc = (union cdesc *) (((char *) shareBuffer) + 144 sizeof(union cdesc)); 145 cdata = ((char *) shareBuffer) + sizeof(union cdesc) + 146 sizeof(union cdesc) * cachebufs; 147 148 memset(shareBuffer, '\0', sharedSize); 149 } 150 } 151 152 /* 153 * Find the cache buffer descriptor that shows the minimal access time 154 */ 155 static int 156 findlru(void) 157 { 158 int i; 159 size_t minTime = cdesc[0].cd_time; 160 int minIdx = 0; 161 162 for (i = 0; i < cachebufs; i++) { 163 if (cdesc[i].cd_time < minTime) { 164 minIdx = i; 165 minTime = cdesc[i].cd_time; 166 } 167 } 168 169 return minIdx; 170 } 171 172 /* 173 * Read data directly from disk, with smart error handling. 174 * Try to recover from hard errors by reading in sector sized pieces. 175 * Error recovery is attempted at most BREADEMAX times before seeking 176 * consent from the operator to continue. 177 */ 178 179 static int breaderrors = 0; 180 #define BREADEMAX 32 181 182 void 183 rawread(daddr_t blkno, char *buf, int size) 184 { 185 int cnt, i; 186 187 #ifdef STATS 188 nphysread++; 189 physreadsize += size; 190 #endif 191 192 loop: 193 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) { 194 msg("rawread: lseek fails\n"); 195 goto err; 196 } 197 if ((cnt = read(diskfd, buf, size)) == size) 198 return; 199 if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) { 200 /* 201 * Trying to read the final fragment. 202 * 203 * NB - dump only works in TP_BSIZE blocks, hence 204 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 205 * It should be smarter about not actually trying to 206 * read more than it can get, but for the time being 207 * we punt and scale back the read only when it gets 208 * us into trouble. (mkm 9/25/83) 209 */ 210 size -= dev_bsize; 211 goto loop; 212 } 213 if (cnt == -1) 214 msg("read error from %s: %s: [block %lld]: count=%d\n", 215 disk, strerror(errno), (long long)blkno, size); 216 else 217 msg("short read error from %s: [block %lld]: " 218 "count=%d, got=%d\n", 219 disk, (long long)blkno, size, cnt); 220 err: 221 if (++breaderrors > BREADEMAX) { 222 msg("More than %d block read errors from %s\n", 223 BREADEMAX, disk); 224 broadcast("DUMP IS AILING!\n"); 225 msg("This is an unrecoverable error.\n"); 226 if (!query("Do you want to attempt to continue?")) { 227 dumpabort(0); 228 /*NOTREACHED*/ 229 } else 230 breaderrors = 0; 231 } 232 /* 233 * Zero buffer, then try to read each sector of buffer separately. 234 */ 235 memset(buf, 0, size); 236 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 237 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 238 SEEK_SET) == -1) { 239 msg("rawread: lseek2 fails: %s!\n", 240 strerror(errno)); 241 continue; 242 } 243 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 244 continue; 245 if (cnt == -1) { 246 msg("read error from %s: %s: [sector %lld]: " 247 "count=%ld\n", disk, strerror(errno), 248 (long long)blkno, dev_bsize); 249 continue; 250 } 251 msg("short read error from %s: [sector %lld]: " 252 "count=%ld, got=%d\n", 253 disk, (long long)blkno, dev_bsize, cnt); 254 } 255 } 256 257 void 258 bread(daddr_t blkno, char *buf, int size) 259 { 260 int osize = size, idx; 261 daddr_t oblkno = blkno; 262 char *obuf = buf; 263 daddr_t numBlocks = howmany(size, dev_bsize); 264 265 #ifdef STATS 266 nreads++; 267 readsize += size; 268 #endif 269 270 if (!shareBuffer) { 271 rawread(blkno, buf, size); 272 return; 273 } 274 275 if (flock(diskfd, LOCK_EX)) { 276 msg("flock(LOCK_EX) failed: %s\n", 277 strerror(errno)); 278 rawread(blkno, buf, size); 279 return; 280 } 281 282 retry: 283 idx = 0; 284 while (size > 0) { 285 int i; 286 287 for (i = 0; i < cachebufs; i++) { 288 union cdesc *curr = &cdesc[(i + idx) % cachebufs]; 289 290 #ifdef DIAGNOSTICS 291 if (curr->cd_owner) { 292 fprintf(stderr, "Owner is set (%d, me=%d), can" 293 "not happen.\n", curr->cd_owner, getpid()); 294 } 295 #endif 296 297 if (curr->cd_blkend == 0) 298 continue; 299 /* 300 * If we find a bit of the read in the buffers, 301 * now compute how many blocks we can copy, 302 * copy them out, adjust blkno, buf and size, 303 * and restart 304 */ 305 if (curr->cd_blkstart <= blkno && 306 blkno < curr->cd_blkend) { 307 /* Number of data blocks to be copied */ 308 int toCopy = MIN(size, 309 (curr->cd_blkend - blkno) << dev_bshift); 310 #ifdef DIAGNOSTICS 311 if (toCopy <= 0 || toCopy > CSIZE) { 312 fprintf(stderr, "toCopy %d !\n", 313 toCopy); 314 dumpabort(0); 315 } 316 if (CDATA(curr) + 317 ((blkno - curr->cd_blkstart) << 318 dev_bshift) < CDATA(curr) || 319 CDATA(curr) + 320 ((blkno - curr->cd_blkstart) << 321 dev_bshift) > CDATA(curr) + CSIZE) { 322 fprintf(stderr, "%p < %p !!!\n", 323 CDATA(curr) + ((blkno - 324 curr->cd_blkstart) << dev_bshift), 325 CDATA(curr)); 326 fprintf(stderr, 327 "cdesc[i].cd_blkstart %lld " 328 "blkno %lld dev_bsize %ld\n", 329 (long long)curr->cd_blkstart, 330 (long long)blkno, 331 dev_bsize); 332 dumpabort(0); 333 } 334 #endif 335 memcpy(buf, CDATA(curr) + 336 ((blkno - curr->cd_blkstart) << 337 dev_bshift), 338 toCopy); 339 340 buf += toCopy; 341 size -= toCopy; 342 blkno += howmany(toCopy, dev_bsize); 343 numBlocks -= howmany(toCopy, dev_bsize); 344 345 curr->cd_time = cheader->cd_count++; 346 347 /* 348 * If all data of a cache block have been 349 * read, chances are good no more reads 350 * will occur, so expire the cache immediately 351 */ 352 353 curr->cd_blocksRead += 354 howmany(toCopy, dev_bsize); 355 if (curr->cd_blocksRead >= nblksread) 356 curr->cd_time = 0; 357 358 goto retry; 359 } 360 } 361 362 /* No more to do? */ 363 if (size == 0) 364 break; 365 366 /* 367 * This does actually not happen if fs blocks are not greater 368 * than nblksread. 369 */ 370 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 371 rawread(oblkno, obuf, osize); 372 break; 373 } else { 374 ssize_t rsize; 375 daddr_t blockBlkNo; 376 377 blockBlkNo = (blkno / nblksread) * nblksread; 378 idx = findlru(); 379 rsize = MIN(nblksread, 380 ufsib->ufs_dsize - blockBlkNo) << dev_bshift; 381 382 #ifdef DIAGNOSTICS 383 if (cdesc[idx].cd_owner) 384 fprintf(stderr, "Owner is set (%d, me=%d), can" 385 "not happen(2).\n", cdesc[idx].cd_owner, 386 getpid()); 387 cdesc[idx].cd_owner = getpid(); 388 #endif 389 cdesc[idx].cd_time = cheader->cd_count++; 390 cdesc[idx].cd_blkstart = blockBlkNo; 391 cdesc[idx].cd_blkend = 0; 392 cdesc[idx].cd_blocksRead = 0; 393 394 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift), 395 SEEK_SET) == -1) { 396 msg("readBlocks: lseek fails: %s\n", 397 strerror(errno)); 398 rsize = -1; 399 } else { 400 rsize = read(diskfd, 401 CDATA(&cdesc[idx]), rsize); 402 if (rsize < 0) { 403 msg("readBlocks: read fails: %s\n", 404 strerror(errno)); 405 } 406 } 407 408 /* On errors, panic, punt, try to read without 409 * cache and let raw read routine do the rest. 410 */ 411 412 if (rsize <= 0) { 413 rawread(oblkno, obuf, osize); 414 #ifdef DIAGNOSTICS 415 if (cdesc[idx].cd_owner != getpid()) 416 fprintf(stderr, "Owner changed from " 417 "%d to %d, can't happen\n", 418 getpid(), cdesc[idx].cd_owner); 419 cdesc[idx].cd_owner = 0; 420 #endif 421 break; 422 } 423 424 /* On short read, just note the fact and go on */ 425 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize; 426 427 #ifdef STATS 428 nphysread++; 429 physreadsize += rsize; 430 #endif 431 #ifdef DIAGNOSTICS 432 if (cdesc[idx].cd_owner != getpid()) 433 fprintf(stderr, "Owner changed from " 434 "%d to %d, can't happen\n", 435 getpid(), cdesc[idx].cd_owner); 436 cdesc[idx].cd_owner = 0; 437 #endif 438 /* 439 * We swapped some of data in, let the loop fetch 440 * them from cache 441 */ 442 } 443 } 444 445 if (flock(diskfd, LOCK_UN)) 446 msg("flock(LOCK_UN) failed: %s\n", 447 strerror(errno)); 448 } 449 450 void 451 printcachestats(void) 452 { 453 454 #ifdef STATS 455 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 456 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 457 getpid(), nreads, (u_int) readsize, nphysread, 458 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 459 (int) (((physreadsize - readsize) * 100) / readsize)); 460 #endif 461 } 462