1 /* $NetBSD: rcache.c,v 1.17 2003/02/04 08:43:16 enami Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 #include <sys/cdefs.h> 41 #ifndef lint 42 __RCSID("$NetBSD: rcache.c,v 1.17 2003/02/04 08:43:16 enami Exp $"); 43 #endif /* not lint */ 44 45 #include <sys/types.h> 46 #include <sys/uio.h> 47 #include <sys/mman.h> 48 #include <sys/param.h> 49 #include <sys/sysctl.h> 50 #include <ufs/ufs/dinode.h> 51 52 #include <stdio.h> 53 #include <stdlib.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include <errno.h> 57 #include <string.h> 58 59 #include "dump.h" 60 61 /*-----------------------------------------------------------------------*/ 62 #define MAXCACHEBUFS 512 /* max 512 buffers */ 63 #define MAXMEMPART 6 /* max 15% of the user mem */ 64 65 /*-----------------------------------------------------------------------*/ 66 union cdesc { 67 volatile size_t cd_count; 68 struct { 69 volatile daddr_t blkstart; 70 volatile daddr_t blkend; /* start + nblksread */ 71 volatile daddr_t blocksRead; 72 volatile size_t time; 73 #ifdef DIAGNOSTICS 74 volatile pid_t owner; 75 #endif 76 } desc; 77 #define cd_blkstart desc.blkstart 78 #define cd_blkend desc.blkend 79 #define cd_blocksRead desc.blocksRead 80 #define cd_time desc.time 81 #define cd_owner desc.owner 82 }; 83 84 static int findlru(void); 85 86 static void *shareBuffer = NULL; 87 static union cdesc *cheader; 88 static union cdesc *cdesc; 89 static char *cdata; 90 static int cachebufs; 91 static int nblksread; 92 93 #ifdef STATS 94 static int nreads; 95 static int nphysread; 96 static int64_t readsize; 97 static int64_t physreadsize; 98 #endif 99 100 #define CSIZE (nblksread << dev_bshift) /* cache buf size */ 101 #define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE) 102 103 void 104 initcache(int cachesize, int readblksize) 105 { 106 size_t len; 107 size_t sharedSize; 108 109 /* Convert read block size in terms of filesystem block size */ 110 nblksread = howmany(readblksize, ufsib->ufs_bsize); 111 112 /* Then, convert it in terms of device block size */ 113 nblksread <<= ufsib->ufs_bshift - dev_bshift; 114 115 if (cachesize == -1) { /* Compute from memory available */ 116 int usermem; 117 int mib[2] = { CTL_HW, HW_USERMEM }; 118 119 len = sizeof(usermem); 120 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 121 msg("sysctl(hw.usermem) failed: %s\n", 122 strerror(errno)); 123 return; 124 } 125 cachebufs = (usermem / MAXMEMPART) / CSIZE; 126 } else { /* User specified */ 127 cachebufs = cachesize; 128 } 129 130 if (cachebufs) { /* Don't allocate if zero --> no caching */ 131 if (cachebufs > MAXCACHEBUFS) 132 cachebufs = MAXCACHEBUFS; 133 134 sharedSize = sizeof(union cdesc) + 135 sizeof(union cdesc) * cachebufs + 136 cachebufs * CSIZE; 137 #ifdef STATS 138 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 139 sharedSize); 140 #endif 141 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 142 MAP_ANON | MAP_SHARED, -1, 0); 143 if (shareBuffer == MAP_FAILED) { 144 msg("can't mmap shared memory for buffer: %s\n", 145 strerror(errno)); 146 return; 147 } 148 cheader = shareBuffer; 149 cdesc = (union cdesc *) (((char *) shareBuffer) + 150 sizeof(union cdesc)); 151 cdata = ((char *) shareBuffer) + sizeof(union cdesc) + 152 sizeof(union cdesc) * cachebufs; 153 154 memset(shareBuffer, '\0', sharedSize); 155 } 156 } 157 158 /* 159 * Find the cache buffer descriptor that shows the minimal access time 160 */ 161 static int 162 findlru(void) 163 { 164 int i; 165 size_t minTime = cdesc[0].cd_time; 166 int minIdx = 0; 167 168 for (i = 0; i < cachebufs; i++) { 169 if (cdesc[i].cd_time < minTime) { 170 minIdx = i; 171 minTime = cdesc[i].cd_time; 172 } 173 } 174 175 return minIdx; 176 } 177 178 /* 179 * Read data directly from disk, with smart error handling. 180 * Try to recover from hard errors by reading in sector sized pieces. 181 * Error recovery is attempted at most BREADEMAX times before seeking 182 * consent from the operator to continue. 183 */ 184 185 static int breaderrors = 0; 186 #define BREADEMAX 32 187 188 void 189 rawread(daddr_t blkno, char *buf, int size) 190 { 191 int cnt, i; 192 193 #ifdef STATS 194 nphysread++; 195 physreadsize += size; 196 #endif 197 198 loop: 199 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) < 0) { 200 msg("rawread: lseek fails\n"); 201 goto err; 202 } 203 if ((cnt = read(diskfd, buf, size)) == size) 204 return; 205 if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) { 206 /* 207 * Trying to read the final fragment. 208 * 209 * NB - dump only works in TP_BSIZE blocks, hence 210 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 211 * It should be smarter about not actually trying to 212 * read more than it can get, but for the time being 213 * we punt and scale back the read only when it gets 214 * us into trouble. (mkm 9/25/83) 215 */ 216 size -= dev_bsize; 217 goto loop; 218 } 219 if (cnt == -1) 220 msg("read error from %s: %s: [block %lld]: count=%d\n", 221 disk, strerror(errno), (long long)blkno, size); 222 else 223 msg("short read error from %s: [block %lld]: " 224 "count=%d, got=%d\n", 225 disk, (long long)blkno, size, cnt); 226 err: 227 if (++breaderrors > BREADEMAX) { 228 msg("More than %d block read errors from %s\n", 229 BREADEMAX, disk); 230 broadcast("DUMP IS AILING!\n"); 231 msg("This is an unrecoverable error.\n"); 232 if (!query("Do you want to attempt to continue?")) { 233 dumpabort(0); 234 /*NOTREACHED*/ 235 } else 236 breaderrors = 0; 237 } 238 /* 239 * Zero buffer, then try to read each sector of buffer separately. 240 */ 241 memset(buf, 0, size); 242 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 243 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 244 SEEK_SET) < 0) { 245 msg("rawread: lseek2 fails: %s!\n", 246 strerror(errno)); 247 continue; 248 } 249 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 250 continue; 251 if (cnt == -1) { 252 msg("read error from %s: %s: [sector %lld]: " 253 "count=%ld: %s\n", disk, strerror(errno), 254 (long long)blkno, 255 dev_bsize, strerror(errno)); 256 continue; 257 } 258 msg("short read error from %s: [sector %lld]: " 259 "count=%ld, got=%d\n", 260 disk, (long long)blkno, dev_bsize, cnt); 261 } 262 } 263 264 void 265 bread(daddr_t blkno, char *buf, int size) 266 { 267 int osize = size, idx; 268 daddr_t oblkno = blkno; 269 char *obuf = buf; 270 daddr_t numBlocks = howmany(size, dev_bsize); 271 272 #ifdef STATS 273 nreads++; 274 readsize += size; 275 #endif 276 277 if (!shareBuffer) { 278 rawread(blkno, buf, size); 279 return; 280 } 281 282 if (flock(diskfd, LOCK_EX)) { 283 msg("flock(LOCK_EX) failed: %s\n", 284 strerror(errno)); 285 rawread(blkno, buf, size); 286 return; 287 } 288 289 retry: 290 idx = 0; 291 while (size > 0) { 292 int i; 293 294 for (i = 0; i < cachebufs; i++) { 295 union cdesc *curr = &cdesc[(i + idx) % cachebufs]; 296 297 #ifdef DIAGNOSTICS 298 if (curr->cd_owner) { 299 fprintf(stderr, "Owner is set (%d, me=%d), can" 300 "not happen.\n", curr->cd_owner, getpid()); 301 } 302 #endif 303 304 if (curr->cd_blkend == 0) 305 continue; 306 /* 307 * If we find a bit of the read in the buffers, 308 * now compute how many blocks we can copy, 309 * copy them out, adjust blkno, buf and size, 310 * and restart 311 */ 312 if (curr->cd_blkstart <= blkno && 313 blkno < curr->cd_blkend) { 314 /* Number of data blocks to be copied */ 315 int toCopy = MIN(size, 316 (curr->cd_blkend - blkno) << dev_bshift); 317 #ifdef DIAGNOSTICS 318 if (toCopy <= 0 || toCopy > CSIZE) { 319 fprintf(stderr, "toCopy %d !\n", 320 toCopy); 321 dumpabort(0); 322 } 323 if (CDATA(curr) + 324 ((blkno - curr->cd_blkstart) << 325 dev_bshift) < CDATA(curr) || 326 CDATA(curr) + 327 ((blkno - curr->cd_blkstart) << 328 dev_bshift) > CDATA(curr) + CSIZE) { 329 fprintf(stderr, "%p < %p !!!\n", 330 CDATA(curr) + ((blkno - 331 curr->cd_blkstart) << dev_bshift), 332 CDATA(curr)); 333 fprintf(stderr, 334 "cdesc[i].cd_blkstart %lld " 335 "blkno %lld dev_bsize %ld\n", 336 (long long)curr->cd_blkstart, 337 (long long)blkno, 338 dev_bsize); 339 dumpabort(0); 340 } 341 #endif 342 memcpy(buf, CDATA(curr) + 343 ((blkno - curr->cd_blkstart) << 344 dev_bshift), 345 toCopy); 346 347 buf += toCopy; 348 size -= toCopy; 349 blkno += howmany(toCopy, dev_bsize); 350 numBlocks -= howmany(toCopy, dev_bsize); 351 352 curr->cd_time = cheader->cd_count++; 353 354 /* 355 * If all data of a cache block have been 356 * read, chances are good no more reads 357 * will occur, so expire the cache immediately 358 */ 359 360 curr->cd_blocksRead += 361 howmany(toCopy, dev_bsize); 362 if (curr->cd_blocksRead >= nblksread) 363 curr->cd_time = 0; 364 365 goto retry; 366 } 367 } 368 369 /* No more to do? */ 370 if (size == 0) 371 break; 372 373 /* 374 * This does actually not happen if fs blocks are not greater 375 * than nblksread. 376 */ 377 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 378 rawread(oblkno, obuf, osize); 379 break; 380 } else { 381 ssize_t rsize; 382 daddr_t blockBlkNo; 383 384 blockBlkNo = (blkno / nblksread) * nblksread; 385 idx = findlru(); 386 rsize = MIN(nblksread, 387 ufsib->ufs_dsize - blockBlkNo) << dev_bshift; 388 389 #ifdef DIAGNOSTICS 390 if (cdesc[idx].cd_owner) 391 fprintf(stderr, "Owner is set (%d, me=%d), can" 392 "not happen(2).\n", cdesc[idx].cd_owner, 393 getpid()); 394 cdesc[idx].cd_owner = getpid(); 395 #endif 396 cdesc[idx].cd_time = cheader->cd_count++; 397 cdesc[idx].cd_blkstart = blockBlkNo; 398 cdesc[idx].cd_blkend = 0; 399 cdesc[idx].cd_blocksRead = 0; 400 401 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift), 402 SEEK_SET) < 0) { 403 msg("readBlocks: lseek fails: %s\n", 404 strerror(errno)); 405 rsize = -1; 406 } else { 407 rsize = read(diskfd, 408 CDATA(&cdesc[idx]), rsize); 409 if (rsize < 0) { 410 msg("readBlocks: read fails: %s\n", 411 strerror(errno)); 412 } 413 } 414 415 /* On errors, panic, punt, try to read without 416 * cache and let raw read routine do the rest. 417 */ 418 419 if (rsize <= 0) { 420 rawread(oblkno, obuf, osize); 421 #ifdef DIAGNOSTICS 422 if (cdesc[idx].cd_owner != getpid()) 423 fprintf(stderr, "Owner changed from " 424 "%d to %d, can't happen\n", 425 getpid(), cdesc[idx].cd_owner); 426 cdesc[idx].cd_owner = 0; 427 #endif 428 break; 429 } 430 431 /* On short read, just note the fact and go on */ 432 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize; 433 434 #ifdef STATS 435 nphysread++; 436 physreadsize += rsize; 437 #endif 438 #ifdef DIAGNOSTICS 439 if (cdesc[idx].cd_owner != getpid()) 440 fprintf(stderr, "Owner changed from " 441 "%d to %d, can't happen\n", 442 getpid(), cdesc[idx].cd_owner); 443 cdesc[idx].cd_owner = 0; 444 #endif 445 /* 446 * We swapped some of data in, let the loop fetch 447 * them from cache 448 */ 449 } 450 } 451 452 if (flock(diskfd, LOCK_UN)) 453 msg("flock(LOCK_UN) failed: %s\n", 454 strerror(errno)); 455 } 456 457 void 458 printcachestats(void) 459 { 460 461 #ifdef STATS 462 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 463 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 464 getpid(), nreads, (u_int) readsize, nphysread, 465 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 466 (int) (((physreadsize - readsize) * 100) / readsize)); 467 #endif 468 } 469