1 /* $NetBSD: rcache.c,v 1.22 2008/04/28 20:23:08 martin Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 #ifndef lint 35 __RCSID("$NetBSD: rcache.c,v 1.22 2008/04/28 20:23:08 martin Exp $"); 36 #endif /* not lint */ 37 38 #include <sys/types.h> 39 #include <sys/uio.h> 40 #include <sys/mman.h> 41 #include <sys/param.h> 42 #include <sys/sysctl.h> 43 #include <ufs/ufs/dinode.h> 44 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <unistd.h> 48 #include <fcntl.h> 49 #include <errno.h> 50 #include <string.h> 51 52 #include "dump.h" 53 54 /*-----------------------------------------------------------------------*/ 55 #define MAXCACHEBUFS 512 /* max 512 buffers */ 56 #define MAXMEMPART 6 /* max 15% of the user mem */ 57 58 /*-----------------------------------------------------------------------*/ 59 union cdesc { 60 volatile size_t cd_count; 61 struct { 62 volatile daddr_t blkstart; 63 volatile daddr_t blkend; /* start + nblksread */ 64 volatile daddr_t blocksRead; 65 volatile size_t time; 66 #ifdef DIAGNOSTICS 67 volatile pid_t owner; 68 #endif 69 } desc; 70 #define cd_blkstart desc.blkstart 71 #define cd_blkend desc.blkend 72 #define cd_blocksRead desc.blocksRead 73 #define cd_time desc.time 74 #define cd_owner desc.owner 75 }; 76 77 static int findlru(void); 78 79 static void *shareBuffer = NULL; 80 static union cdesc *cheader; 81 static union cdesc *cdesc; 82 static char *cdata; 83 static int cachebufs; 84 static int nblksread; 85 86 #ifdef STATS 87 static int nreads; 88 static int nphysread; 89 static int64_t readsize; 90 static int64_t physreadsize; 91 #endif 92 93 #define CSIZE (nblksread << dev_bshift) /* cache buf size */ 94 #define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE) 95 96 void 97 initcache(int cachesize, int readblksize) 98 { 99 size_t len; 100 size_t sharedSize; 101 102 /* Convert read block size in terms of filesystem block size */ 103 nblksread = howmany(readblksize, ufsib->ufs_bsize); 104 105 /* Then, convert it in terms of device block size */ 106 nblksread <<= ufsib->ufs_bshift - dev_bshift; 107 108 if (cachesize == -1) { /* Compute from memory available */ 109 uint64_t usermem; 110 int mib[2] = { CTL_HW, HW_USERMEM64 }; 111 112 len = sizeof(usermem); 113 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 114 msg("sysctl(hw.usermem) failed: %s\n", 115 strerror(errno)); 116 return; 117 } 118 cachebufs = (usermem / MAXMEMPART) / CSIZE; 119 } else { /* User specified */ 120 cachebufs = cachesize; 121 } 122 123 if (cachebufs) { /* Don't allocate if zero --> no caching */ 124 if (cachebufs > MAXCACHEBUFS) 125 cachebufs = MAXCACHEBUFS; 126 127 sharedSize = sizeof(union cdesc) + 128 sizeof(union cdesc) * cachebufs + 129 cachebufs * CSIZE; 130 #ifdef STATS 131 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 132 sharedSize); 133 #endif 134 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 135 MAP_ANON | MAP_SHARED, -1, 0); 136 if (shareBuffer == MAP_FAILED) { 137 msg("can't mmap shared memory for buffer: %s\n", 138 strerror(errno)); 139 return; 140 } 141 cheader = shareBuffer; 142 cdesc = (union cdesc *) (((char *) shareBuffer) + 143 sizeof(union cdesc)); 144 cdata = ((char *) shareBuffer) + sizeof(union cdesc) + 145 sizeof(union cdesc) * cachebufs; 146 147 memset(shareBuffer, '\0', sharedSize); 148 } 149 } 150 151 /* 152 * Find the cache buffer descriptor that shows the minimal access time 153 */ 154 static int 155 findlru(void) 156 { 157 int i; 158 size_t minTime = cdesc[0].cd_time; 159 int minIdx = 0; 160 161 for (i = 0; i < cachebufs; i++) { 162 if (cdesc[i].cd_time < minTime) { 163 minIdx = i; 164 minTime = cdesc[i].cd_time; 165 } 166 } 167 168 return minIdx; 169 } 170 171 /* 172 * Read data directly from disk, with smart error handling. 173 * Try to recover from hard errors by reading in sector sized pieces. 174 * Error recovery is attempted at most BREADEMAX times before seeking 175 * consent from the operator to continue. 176 */ 177 178 static int breaderrors = 0; 179 #define BREADEMAX 32 180 181 void 182 rawread(daddr_t blkno, char *buf, int size) 183 { 184 int cnt, i; 185 186 #ifdef STATS 187 nphysread++; 188 physreadsize += size; 189 #endif 190 191 loop: 192 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) { 193 msg("rawread: lseek fails\n"); 194 goto err; 195 } 196 if ((cnt = read(diskfd, buf, size)) == size) 197 return; 198 if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) { 199 /* 200 * Trying to read the final fragment. 201 * 202 * NB - dump only works in TP_BSIZE blocks, hence 203 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 204 * It should be smarter about not actually trying to 205 * read more than it can get, but for the time being 206 * we punt and scale back the read only when it gets 207 * us into trouble. (mkm 9/25/83) 208 */ 209 size -= dev_bsize; 210 goto loop; 211 } 212 if (cnt == -1) 213 msg("read error from %s: %s: [block %lld]: count=%d\n", 214 disk, strerror(errno), (long long)blkno, size); 215 else 216 msg("short read error from %s: [block %lld]: " 217 "count=%d, got=%d\n", 218 disk, (long long)blkno, size, cnt); 219 err: 220 if (++breaderrors > BREADEMAX) { 221 msg("More than %d block read errors from %s\n", 222 BREADEMAX, disk); 223 broadcast("DUMP IS AILING!\n"); 224 msg("This is an unrecoverable error.\n"); 225 if (!query("Do you want to attempt to continue?")) { 226 dumpabort(0); 227 /*NOTREACHED*/ 228 } else 229 breaderrors = 0; 230 } 231 /* 232 * Zero buffer, then try to read each sector of buffer separately. 233 */ 234 memset(buf, 0, size); 235 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 236 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 237 SEEK_SET) == -1) { 238 msg("rawread: lseek2 fails: %s!\n", 239 strerror(errno)); 240 continue; 241 } 242 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 243 continue; 244 if (cnt == -1) { 245 msg("read error from %s: %s: [sector %lld]: " 246 "count=%ld\n", disk, strerror(errno), 247 (long long)blkno, dev_bsize); 248 continue; 249 } 250 msg("short read error from %s: [sector %lld]: " 251 "count=%ld, got=%d\n", 252 disk, (long long)blkno, dev_bsize, cnt); 253 } 254 } 255 256 void 257 bread(daddr_t blkno, char *buf, int size) 258 { 259 int osize = size, idx; 260 daddr_t oblkno = blkno; 261 char *obuf = buf; 262 daddr_t numBlocks = howmany(size, dev_bsize); 263 264 #ifdef STATS 265 nreads++; 266 readsize += size; 267 #endif 268 269 if (!shareBuffer) { 270 rawread(blkno, buf, size); 271 return; 272 } 273 274 if (flock(diskfd, LOCK_EX)) { 275 msg("flock(LOCK_EX) failed: %s\n", 276 strerror(errno)); 277 rawread(blkno, buf, size); 278 return; 279 } 280 281 retry: 282 idx = 0; 283 while (size > 0) { 284 int i; 285 286 for (i = 0; i < cachebufs; i++) { 287 union cdesc *curr = &cdesc[(i + idx) % cachebufs]; 288 289 #ifdef DIAGNOSTICS 290 if (curr->cd_owner) { 291 fprintf(stderr, "Owner is set (%d, me=%d), can" 292 "not happen.\n", curr->cd_owner, getpid()); 293 } 294 #endif 295 296 if (curr->cd_blkend == 0) 297 continue; 298 /* 299 * If we find a bit of the read in the buffers, 300 * now compute how many blocks we can copy, 301 * copy them out, adjust blkno, buf and size, 302 * and restart 303 */ 304 if (curr->cd_blkstart <= blkno && 305 blkno < curr->cd_blkend) { 306 /* Number of data blocks to be copied */ 307 int toCopy = MIN(size, 308 (curr->cd_blkend - blkno) << dev_bshift); 309 #ifdef DIAGNOSTICS 310 if (toCopy <= 0 || toCopy > CSIZE) { 311 fprintf(stderr, "toCopy %d !\n", 312 toCopy); 313 dumpabort(0); 314 } 315 if (CDATA(curr) + 316 ((blkno - curr->cd_blkstart) << 317 dev_bshift) < CDATA(curr) || 318 CDATA(curr) + 319 ((blkno - curr->cd_blkstart) << 320 dev_bshift) > CDATA(curr) + CSIZE) { 321 fprintf(stderr, "%p < %p !!!\n", 322 CDATA(curr) + ((blkno - 323 curr->cd_blkstart) << dev_bshift), 324 CDATA(curr)); 325 fprintf(stderr, 326 "cdesc[i].cd_blkstart %lld " 327 "blkno %lld dev_bsize %ld\n", 328 (long long)curr->cd_blkstart, 329 (long long)blkno, 330 dev_bsize); 331 dumpabort(0); 332 } 333 #endif 334 memcpy(buf, CDATA(curr) + 335 ((blkno - curr->cd_blkstart) << 336 dev_bshift), 337 toCopy); 338 339 buf += toCopy; 340 size -= toCopy; 341 blkno += howmany(toCopy, dev_bsize); 342 numBlocks -= howmany(toCopy, dev_bsize); 343 344 curr->cd_time = cheader->cd_count++; 345 346 /* 347 * If all data of a cache block have been 348 * read, chances are good no more reads 349 * will occur, so expire the cache immediately 350 */ 351 352 curr->cd_blocksRead += 353 howmany(toCopy, dev_bsize); 354 if (curr->cd_blocksRead >= nblksread) 355 curr->cd_time = 0; 356 357 goto retry; 358 } 359 } 360 361 /* No more to do? */ 362 if (size == 0) 363 break; 364 365 /* 366 * This does actually not happen if fs blocks are not greater 367 * than nblksread. 368 */ 369 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 370 rawread(oblkno, obuf, osize); 371 break; 372 } else { 373 ssize_t rsize; 374 daddr_t blockBlkNo; 375 376 blockBlkNo = (blkno / nblksread) * nblksread; 377 idx = findlru(); 378 rsize = MIN(nblksread, 379 ufsib->ufs_dsize - blockBlkNo) << dev_bshift; 380 381 #ifdef DIAGNOSTICS 382 if (cdesc[idx].cd_owner) 383 fprintf(stderr, "Owner is set (%d, me=%d), can" 384 "not happen(2).\n", cdesc[idx].cd_owner, 385 getpid()); 386 cdesc[idx].cd_owner = getpid(); 387 #endif 388 cdesc[idx].cd_time = cheader->cd_count++; 389 cdesc[idx].cd_blkstart = blockBlkNo; 390 cdesc[idx].cd_blkend = 0; 391 cdesc[idx].cd_blocksRead = 0; 392 393 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift), 394 SEEK_SET) == -1) { 395 msg("readBlocks: lseek fails: %s\n", 396 strerror(errno)); 397 rsize = -1; 398 } else { 399 rsize = read(diskfd, 400 CDATA(&cdesc[idx]), rsize); 401 if (rsize < 0) { 402 msg("readBlocks: read fails: %s\n", 403 strerror(errno)); 404 } 405 } 406 407 /* On errors, panic, punt, try to read without 408 * cache and let raw read routine do the rest. 409 */ 410 411 if (rsize <= 0) { 412 rawread(oblkno, obuf, osize); 413 #ifdef DIAGNOSTICS 414 if (cdesc[idx].cd_owner != getpid()) 415 fprintf(stderr, "Owner changed from " 416 "%d to %d, can't happen\n", 417 getpid(), cdesc[idx].cd_owner); 418 cdesc[idx].cd_owner = 0; 419 #endif 420 break; 421 } 422 423 /* On short read, just note the fact and go on */ 424 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize; 425 426 #ifdef STATS 427 nphysread++; 428 physreadsize += rsize; 429 #endif 430 #ifdef DIAGNOSTICS 431 if (cdesc[idx].cd_owner != getpid()) 432 fprintf(stderr, "Owner changed from " 433 "%d to %d, can't happen\n", 434 getpid(), cdesc[idx].cd_owner); 435 cdesc[idx].cd_owner = 0; 436 #endif 437 /* 438 * We swapped some of data in, let the loop fetch 439 * them from cache 440 */ 441 } 442 } 443 444 if (flock(diskfd, LOCK_UN)) 445 msg("flock(LOCK_UN) failed: %s\n", 446 strerror(errno)); 447 } 448 449 void 450 printcachestats(void) 451 { 452 453 #ifdef STATS 454 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 455 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 456 getpid(), nreads, (u_int) readsize, nphysread, 457 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 458 (int) (((physreadsize - readsize) * 100) / readsize)); 459 #endif 460 } 461