1433d6423SLionel Sambuc 2433d6423SLionel Sambuc #define _SYSTEM 3433d6423SLionel Sambuc 4433d6423SLionel Sambuc #include <assert.h> 56c46a77dSDavid van Moolenbroek #include <string.h> 6433d6423SLionel Sambuc #include <errno.h> 7433d6423SLionel Sambuc #include <math.h> 8433d6423SLionel Sambuc #include <stdlib.h> 9433d6423SLionel Sambuc 10433d6423SLionel Sambuc #include <machine/vmparam.h> 11433d6423SLionel Sambuc 12433d6423SLionel Sambuc #include <sys/param.h> 13433d6423SLionel Sambuc #include <sys/mman.h> 14433d6423SLionel Sambuc 15433d6423SLionel Sambuc #include <minix/dmap.h> 16433d6423SLionel Sambuc #include <minix/libminixfs.h> 17433d6423SLionel Sambuc #include <minix/syslib.h> 18433d6423SLionel Sambuc #include <minix/sysutil.h> 19433d6423SLionel Sambuc #include <minix/u64.h> 20433d6423SLionel Sambuc #include <minix/bdev.h> 214472b590SDavid van Moolenbroek #include <minix/bitmap.h> 22433d6423SLionel Sambuc 236c46a77dSDavid van Moolenbroek #include "inc.h" 246c46a77dSDavid van Moolenbroek 250314acfbSDavid van Moolenbroek /* Buffer (block) cache. To acquire a block, a routine calls lmfs_get_block(), 260314acfbSDavid van Moolenbroek * telling which block it wants. The block is then regarded as "in use" and 270314acfbSDavid van Moolenbroek * has its reference count incremented. All the blocks that are not in use are 280314acfbSDavid van Moolenbroek * chained together in an LRU list, with 'front' pointing to the least recently 290314acfbSDavid van Moolenbroek * used block, and 'rear' to the most recently used block. A reverse chain is 300314acfbSDavid van Moolenbroek * also maintained. Usage for LRU is measured by the time the put_block() is 310314acfbSDavid van Moolenbroek * done. The second parameter to put_block() can violate the LRU order and put 320314acfbSDavid van Moolenbroek * a block on the front of the list, if it will probably not be needed again. 330314acfbSDavid van Moolenbroek * This is used internally only; the lmfs_put_block() API call has no second 340314acfbSDavid van Moolenbroek * parameter. If a block is modified, the modifying routine must mark the 350314acfbSDavid van Moolenbroek * block as dirty, so the block will eventually be rewritten to the disk. 360314acfbSDavid van Moolenbroek */ 370314acfbSDavid van Moolenbroek 380314acfbSDavid van Moolenbroek /* Flags to put_block(). */ 390314acfbSDavid van Moolenbroek #define ONE_SHOT 0x1 /* set if block will not be needed again */ 400314acfbSDavid van Moolenbroek 41b65ad59eSDavid van Moolenbroek #define BUFHASH(b) ((unsigned int)((b) % nr_bufs)) 42433d6423SLionel Sambuc #define MARKCLEAN lmfs_markclean 43433d6423SLionel Sambuc 44433d6423SLionel Sambuc #define MINBUFS 6 /* minimal no of bufs for sanity check */ 45433d6423SLionel Sambuc 46433d6423SLionel Sambuc static struct buf *front; /* points to least recently used free block */ 47433d6423SLionel Sambuc static struct buf *rear; /* points to most recently used free block */ 48433d6423SLionel Sambuc static unsigned int bufs_in_use;/* # bufs currently in use (not on free list)*/ 49433d6423SLionel Sambuc 50433d6423SLionel Sambuc static void rm_lru(struct buf *bp); 516c46a77dSDavid van Moolenbroek static int read_block(struct buf *bp, size_t size); 52433d6423SLionel Sambuc static void freeblock(struct buf *bp); 530314acfbSDavid van Moolenbroek static void cache_heuristic_check(void); 540314acfbSDavid van Moolenbroek static void put_block(struct buf *bp, int put_flags); 55433d6423SLionel Sambuc 56433d6423SLionel Sambuc static int vmcache = 0; /* are we using vm's secondary cache? (initially not) */ 57433d6423SLionel Sambuc 58433d6423SLionel Sambuc static struct buf *buf; 59433d6423SLionel Sambuc static struct buf **buf_hash; /* the buffer hash table */ 60433d6423SLionel Sambuc static unsigned int nr_bufs; 61433d6423SLionel Sambuc static int may_use_vmcache; 62433d6423SLionel Sambuc 6365f76edbSDavid van Moolenbroek static size_t fs_block_size = PAGE_SIZE; /* raw i/o block size */ 64433d6423SLionel Sambuc 651311233cSDavid van Moolenbroek static fsblkcnt_t fs_btotal = 0, fs_bused = 0; 661311233cSDavid van Moolenbroek 67433d6423SLionel Sambuc static int quiet = 0; 68433d6423SLionel Sambuc 69*129adfebSDavid van Moolenbroek typedef struct buf *noxfer_buf_ptr_t; /* annotation for temporary buf ptrs */ 70*129adfebSDavid van Moolenbroek 71433d6423SLionel Sambuc void lmfs_setquiet(int q) { quiet = q; } 72433d6423SLionel Sambuc 731311233cSDavid van Moolenbroek static int fs_bufs_heuristic(int minbufs, fsblkcnt_t btotal, 741311233cSDavid van Moolenbroek fsblkcnt_t bused, int blocksize) 75433d6423SLionel Sambuc { 76433d6423SLionel Sambuc struct vm_stats_info vsi; 77433d6423SLionel Sambuc int bufs; 78433d6423SLionel Sambuc u32_t kbytes_used_fs, kbytes_total_fs, kbcache, kb_fsmax; 79433d6423SLionel Sambuc u32_t kbytes_remain_mem; 80433d6423SLionel Sambuc 81433d6423SLionel Sambuc /* set a reasonable cache size; cache at most a certain 82433d6423SLionel Sambuc * portion of the used FS, and at most a certain %age of remaining 83433d6423SLionel Sambuc * memory 84433d6423SLionel Sambuc */ 85433d6423SLionel Sambuc if(vm_info_stats(&vsi) != OK) { 86433d6423SLionel Sambuc bufs = 1024; 87433d6423SLionel Sambuc if(!quiet) 88433d6423SLionel Sambuc printf("fslib: heuristic info fail: default to %d bufs\n", bufs); 89433d6423SLionel Sambuc return bufs; 90433d6423SLionel Sambuc } 91433d6423SLionel Sambuc 92433d6423SLionel Sambuc /* remaining free memory is unused memory plus memory in used for cache, 93433d6423SLionel Sambuc * as the cache can be evicted 94433d6423SLionel Sambuc */ 95433d6423SLionel Sambuc kbytes_remain_mem = (u64_t)(vsi.vsi_free + vsi.vsi_cached) * 96433d6423SLionel Sambuc vsi.vsi_pagesize / 1024; 97433d6423SLionel Sambuc 98433d6423SLionel Sambuc /* check fs usage. */ 99433d6423SLionel Sambuc kbytes_used_fs = (unsigned long)(((u64_t)bused * blocksize) / 1024); 100433d6423SLionel Sambuc kbytes_total_fs = (unsigned long)(((u64_t)btotal * blocksize) / 1024); 101433d6423SLionel Sambuc 102433d6423SLionel Sambuc /* heuristic for a desired cache size based on FS usage; 103433d6423SLionel Sambuc * but never bigger than half of the total filesystem 104433d6423SLionel Sambuc */ 105433d6423SLionel Sambuc kb_fsmax = sqrt_approx(kbytes_used_fs)*40; 106433d6423SLionel Sambuc kb_fsmax = MIN(kb_fsmax, kbytes_total_fs/2); 107433d6423SLionel Sambuc 108433d6423SLionel Sambuc /* heuristic for a maximum usage - 10% of remaining memory */ 109433d6423SLionel Sambuc kbcache = MIN(kbytes_remain_mem/10, kb_fsmax); 110433d6423SLionel Sambuc bufs = kbcache * 1024 / blocksize; 111433d6423SLionel Sambuc 112433d6423SLionel Sambuc /* but we simply need MINBUFS no matter what */ 113433d6423SLionel Sambuc if(bufs < minbufs) 114433d6423SLionel Sambuc bufs = minbufs; 115433d6423SLionel Sambuc 116433d6423SLionel Sambuc return bufs; 117433d6423SLionel Sambuc } 118433d6423SLionel Sambuc 1191311233cSDavid van Moolenbroek void lmfs_change_blockusage(int delta) 120433d6423SLionel Sambuc { 121433d6423SLionel Sambuc /* Change the number of allocated blocks by 'delta.' 122433d6423SLionel Sambuc * Also accumulate the delta since the last cache re-evaluation. 123433d6423SLionel Sambuc * If it is outside a certain band, ask the cache library to 124433d6423SLionel Sambuc * re-evaluate the cache size. 125433d6423SLionel Sambuc */ 1261311233cSDavid van Moolenbroek static int bitdelta = 0, warn_low = TRUE, warn_high = TRUE; 1271311233cSDavid van Moolenbroek 1281311233cSDavid van Moolenbroek /* Adjust the file system block usage counter accordingly. Do bounds 1291311233cSDavid van Moolenbroek * checking, and report file system misbehavior. 1301311233cSDavid van Moolenbroek */ 1311311233cSDavid van Moolenbroek if (delta > 0 && (fsblkcnt_t)delta > fs_btotal - fs_bused) { 1321311233cSDavid van Moolenbroek if (warn_high) { 1331311233cSDavid van Moolenbroek printf("libminixfs: block usage overflow\n"); 1341311233cSDavid van Moolenbroek warn_high = FALSE; 1351311233cSDavid van Moolenbroek } 1361311233cSDavid van Moolenbroek delta = (int)(fs_btotal - fs_bused); 1371311233cSDavid van Moolenbroek } else if (delta < 0 && (fsblkcnt_t)-delta > fs_bused) { 1381311233cSDavid van Moolenbroek if (warn_low) { 1391311233cSDavid van Moolenbroek printf("libminixfs: block usage underflow\n"); 1401311233cSDavid van Moolenbroek warn_low = FALSE; 1411311233cSDavid van Moolenbroek } 1421311233cSDavid van Moolenbroek delta = -(int)fs_bused; 1431311233cSDavid van Moolenbroek } 1441311233cSDavid van Moolenbroek fs_bused += delta; 1451311233cSDavid van Moolenbroek 146433d6423SLionel Sambuc bitdelta += delta; 1471311233cSDavid van Moolenbroek 1481311233cSDavid van Moolenbroek #define BAND_KB (10*1024) /* recheck cache every 10MB change */ 1491311233cSDavid van Moolenbroek 1501311233cSDavid van Moolenbroek /* If the accumulated delta exceeds the configured threshold, resize 1511311233cSDavid van Moolenbroek * the cache, but only if the cache isn't in use any more. In order to 1521311233cSDavid van Moolenbroek * avoid that the latter case blocks a resize forever, we also call 1531311233cSDavid van Moolenbroek * this function from lmfs_flushall(). Since lmfs_buf_pool() may call 1541311233cSDavid van Moolenbroek * lmfs_flushall(), reset 'bitdelta' before doing the heuristics check. 1551311233cSDavid van Moolenbroek */ 1561311233cSDavid van Moolenbroek if (bufs_in_use == 0 && 1571311233cSDavid van Moolenbroek (bitdelta*(int)fs_block_size/1024 > BAND_KB || 1581311233cSDavid van Moolenbroek bitdelta*(int)fs_block_size/1024 < -BAND_KB)) { 159433d6423SLionel Sambuc bitdelta = 0; 1601311233cSDavid van Moolenbroek cache_heuristic_check(); 161433d6423SLionel Sambuc } 162433d6423SLionel Sambuc } 163433d6423SLionel Sambuc 164433d6423SLionel Sambuc void lmfs_markdirty(struct buf *bp) 165433d6423SLionel Sambuc { 166433d6423SLionel Sambuc bp->lmfs_flags |= VMMC_DIRTY; 167433d6423SLionel Sambuc } 168433d6423SLionel Sambuc 169433d6423SLionel Sambuc void lmfs_markclean(struct buf *bp) 170433d6423SLionel Sambuc { 171433d6423SLionel Sambuc bp->lmfs_flags &= ~VMMC_DIRTY; 172433d6423SLionel Sambuc } 173433d6423SLionel Sambuc 174433d6423SLionel Sambuc int lmfs_isclean(struct buf *bp) 175433d6423SLionel Sambuc { 176433d6423SLionel Sambuc return !(bp->lmfs_flags & VMMC_DIRTY); 177433d6423SLionel Sambuc } 178433d6423SLionel Sambuc 179433d6423SLionel Sambuc static void free_unused_blocks(void) 180433d6423SLionel Sambuc { 181433d6423SLionel Sambuc struct buf *bp; 182433d6423SLionel Sambuc 183433d6423SLionel Sambuc int freed = 0, bytes = 0; 184433d6423SLionel Sambuc printf("libminixfs: freeing; %d blocks in use\n", bufs_in_use); 185433d6423SLionel Sambuc for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++) { 186433d6423SLionel Sambuc if(bp->lmfs_bytes > 0 && bp->lmfs_count == 0) { 187433d6423SLionel Sambuc freed++; 188433d6423SLionel Sambuc bytes += bp->lmfs_bytes; 189433d6423SLionel Sambuc freeblock(bp); 190433d6423SLionel Sambuc } 191433d6423SLionel Sambuc } 192433d6423SLionel Sambuc printf("libminixfs: freeing; %d blocks, %d bytes\n", freed, bytes); 193433d6423SLionel Sambuc } 194433d6423SLionel Sambuc 1956c46a77dSDavid van Moolenbroek static void lmfs_alloc_block(struct buf *bp, size_t block_size) 196433d6423SLionel Sambuc { 197433d6423SLionel Sambuc int len; 198433d6423SLionel Sambuc ASSERT(!bp->data); 199433d6423SLionel Sambuc ASSERT(bp->lmfs_bytes == 0); 200433d6423SLionel Sambuc 2016c46a77dSDavid van Moolenbroek len = roundup(block_size, PAGE_SIZE); 202433d6423SLionel Sambuc 2036c46a77dSDavid van Moolenbroek if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE, 2046c46a77dSDavid van Moolenbroek MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) { 205433d6423SLionel Sambuc free_unused_blocks(); 2066c46a77dSDavid van Moolenbroek if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE, 207433d6423SLionel Sambuc MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) { 208433d6423SLionel Sambuc panic("libminixfs: could not allocate block"); 209433d6423SLionel Sambuc } 210433d6423SLionel Sambuc } 211433d6423SLionel Sambuc assert(bp->data); 2126c46a77dSDavid van Moolenbroek bp->lmfs_bytes = block_size; 213433d6423SLionel Sambuc bp->lmfs_needsetcache = 1; 214433d6423SLionel Sambuc } 215433d6423SLionel Sambuc 216433d6423SLionel Sambuc /*===========================================================================* 217433d6423SLionel Sambuc * lmfs_get_block * 218433d6423SLionel Sambuc *===========================================================================*/ 2196c46a77dSDavid van Moolenbroek int lmfs_get_block(struct buf **bpp, dev_t dev, block64_t block, int how) 220433d6423SLionel Sambuc { 2216c46a77dSDavid van Moolenbroek return lmfs_get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0); 222433d6423SLionel Sambuc } 223433d6423SLionel Sambuc 22465f76edbSDavid van Moolenbroek static void munmap_t(void *a, int len) 225433d6423SLionel Sambuc { 226433d6423SLionel Sambuc vir_bytes av = (vir_bytes) a; 227433d6423SLionel Sambuc assert(a); 228433d6423SLionel Sambuc assert(a != MAP_FAILED); 229433d6423SLionel Sambuc assert(len > 0); 230433d6423SLionel Sambuc assert(!(av % PAGE_SIZE)); 231433d6423SLionel Sambuc 232433d6423SLionel Sambuc len = roundup(len, PAGE_SIZE); 233433d6423SLionel Sambuc 234433d6423SLionel Sambuc assert(!(len % PAGE_SIZE)); 235433d6423SLionel Sambuc 236433d6423SLionel Sambuc if(munmap(a, len) < 0) 237433d6423SLionel Sambuc panic("libminixfs cache: munmap failed"); 238433d6423SLionel Sambuc } 239433d6423SLionel Sambuc 240433d6423SLionel Sambuc static void raisecount(struct buf *bp) 241433d6423SLionel Sambuc { 242433d6423SLionel Sambuc assert(bufs_in_use >= 0); 243433d6423SLionel Sambuc ASSERT(bp->lmfs_count >= 0); 244433d6423SLionel Sambuc bp->lmfs_count++; 245433d6423SLionel Sambuc if(bp->lmfs_count == 1) bufs_in_use++; 246433d6423SLionel Sambuc assert(bufs_in_use > 0); 247433d6423SLionel Sambuc } 248433d6423SLionel Sambuc 249433d6423SLionel Sambuc static void lowercount(struct buf *bp) 250433d6423SLionel Sambuc { 251433d6423SLionel Sambuc assert(bufs_in_use > 0); 252433d6423SLionel Sambuc ASSERT(bp->lmfs_count > 0); 253433d6423SLionel Sambuc bp->lmfs_count--; 254433d6423SLionel Sambuc if(bp->lmfs_count == 0) bufs_in_use--; 255433d6423SLionel Sambuc assert(bufs_in_use >= 0); 256433d6423SLionel Sambuc } 257433d6423SLionel Sambuc 258433d6423SLionel Sambuc static void freeblock(struct buf *bp) 259433d6423SLionel Sambuc { 260433d6423SLionel Sambuc ASSERT(bp->lmfs_count == 0); 261433d6423SLionel Sambuc /* If the block taken is dirty, make it clean by writing it to the disk. 262433d6423SLionel Sambuc * Avoid hysteresis by flushing all other dirty blocks for the same device. 263433d6423SLionel Sambuc */ 264433d6423SLionel Sambuc if (bp->lmfs_dev != NO_DEV) { 265ebd3c067SDavid van Moolenbroek if (!lmfs_isclean(bp)) lmfs_flushdev(bp->lmfs_dev); 2666c46a77dSDavid van Moolenbroek assert(bp->lmfs_bytes > 0); 267433d6423SLionel Sambuc bp->lmfs_dev = NO_DEV; 268433d6423SLionel Sambuc } 269433d6423SLionel Sambuc 270433d6423SLionel Sambuc /* Fill in block's parameters and add it to the hash chain where it goes. */ 271433d6423SLionel Sambuc MARKCLEAN(bp); /* NO_DEV blocks may be marked dirty */ 272433d6423SLionel Sambuc if(bp->lmfs_bytes > 0) { 273433d6423SLionel Sambuc assert(bp->data); 274433d6423SLionel Sambuc munmap_t(bp->data, bp->lmfs_bytes); 275433d6423SLionel Sambuc bp->lmfs_bytes = 0; 276433d6423SLionel Sambuc bp->data = NULL; 277433d6423SLionel Sambuc } else assert(!bp->data); 278433d6423SLionel Sambuc } 279433d6423SLionel Sambuc 280433d6423SLionel Sambuc /*===========================================================================* 281e94f856bSDavid van Moolenbroek * find_block * 282e94f856bSDavid van Moolenbroek *===========================================================================*/ 283e94f856bSDavid van Moolenbroek static struct buf *find_block(dev_t dev, block64_t block) 284e94f856bSDavid van Moolenbroek { 285e94f856bSDavid van Moolenbroek /* Search the hash chain for (dev, block). Return the buffer structure if 286e94f856bSDavid van Moolenbroek * found, or NULL otherwise. 287e94f856bSDavid van Moolenbroek */ 288e94f856bSDavid van Moolenbroek struct buf *bp; 289e94f856bSDavid van Moolenbroek int b; 290e94f856bSDavid van Moolenbroek 291e94f856bSDavid van Moolenbroek assert(dev != NO_DEV); 292e94f856bSDavid van Moolenbroek 293e94f856bSDavid van Moolenbroek b = BUFHASH(block); 294e94f856bSDavid van Moolenbroek for (bp = buf_hash[b]; bp != NULL; bp = bp->lmfs_hash) 295e94f856bSDavid van Moolenbroek if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev) 296e94f856bSDavid van Moolenbroek return bp; 297e94f856bSDavid van Moolenbroek 298e94f856bSDavid van Moolenbroek return NULL; 299e94f856bSDavid van Moolenbroek } 300e94f856bSDavid van Moolenbroek 301e94f856bSDavid van Moolenbroek /*===========================================================================* 3026c46a77dSDavid van Moolenbroek * get_block_ino * 303433d6423SLionel Sambuc *===========================================================================*/ 3046c46a77dSDavid van Moolenbroek static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how, 3056c46a77dSDavid van Moolenbroek ino_t ino, u64_t ino_off, size_t block_size) 306433d6423SLionel Sambuc { 3076c46a77dSDavid van Moolenbroek /* Check to see if the requested block is in the block cache. The requested 3086c46a77dSDavid van Moolenbroek * block is identified by the block number in 'block' on device 'dev', counted 3096c46a77dSDavid van Moolenbroek * in the file system block size. The amount of data requested for this block 3106c46a77dSDavid van Moolenbroek * is given in 'block_size', which may be less than the file system block size 3116c46a77dSDavid van Moolenbroek * iff the requested block is the last (partial) block on a device. Note that 3126c46a77dSDavid van Moolenbroek * the given block size does *not* affect the conversion of 'block' to a byte 3136c46a77dSDavid van Moolenbroek * offset! Either way, if the block could be obtained, either from the cache 3146c46a77dSDavid van Moolenbroek * or by reading from the device, return OK, with a pointer to the buffer 3156c46a77dSDavid van Moolenbroek * structure stored in 'bpp'. If not, return a negative error code (and no 3166c46a77dSDavid van Moolenbroek * buffer). If necessary, evict some other block and fetch the contents from 3176c46a77dSDavid van Moolenbroek * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to 3186c46a77dSDavid van Moolenbroek * overwrite the requested block in its entirety, so it is only necessary to 3196c46a77dSDavid van Moolenbroek * see if it is in the cache; if it is not, any free buffer will do. If 'how' 3204472b590SDavid van Moolenbroek * is PEEK, the function returns the block if it is in the cache or the VM 3214472b590SDavid van Moolenbroek * cache, and an ENOENT error code otherwise. 322433d6423SLionel Sambuc * In addition to the LRU chain, there is also a hash chain to link together 323433d6423SLionel Sambuc * blocks whose block numbers end with the same bit strings, for fast lookup. 324433d6423SLionel Sambuc */ 3256c46a77dSDavid van Moolenbroek int b, r; 326433d6423SLionel Sambuc static struct buf *bp; 327b65ad59eSDavid van Moolenbroek uint64_t dev_off; 328433d6423SLionel Sambuc struct buf *prev_ptr; 329433d6423SLionel Sambuc 330433d6423SLionel Sambuc assert(buf_hash); 331433d6423SLionel Sambuc assert(buf); 332433d6423SLionel Sambuc assert(nr_bufs > 0); 333433d6423SLionel Sambuc 334433d6423SLionel Sambuc ASSERT(fs_block_size > 0); 335433d6423SLionel Sambuc 336433d6423SLionel Sambuc assert(dev != NO_DEV); 337433d6423SLionel Sambuc 338b65ad59eSDavid van Moolenbroek assert(block <= UINT64_MAX / fs_block_size); 339b65ad59eSDavid van Moolenbroek 340b65ad59eSDavid van Moolenbroek dev_off = block * fs_block_size; 341b65ad59eSDavid van Moolenbroek 342433d6423SLionel Sambuc if((ino_off % fs_block_size)) { 343433d6423SLionel Sambuc 344433d6423SLionel Sambuc printf("cache: unaligned lmfs_get_block_ino ino_off %llu\n", 345433d6423SLionel Sambuc ino_off); 346433d6423SLionel Sambuc util_stacktrace(); 347433d6423SLionel Sambuc } 348433d6423SLionel Sambuc 349e94f856bSDavid van Moolenbroek /* See if the block is in the cache. If so, we can return it right away. */ 350e94f856bSDavid van Moolenbroek bp = find_block(dev, block); 351e94f856bSDavid van Moolenbroek if (bp != NULL && !(bp->lmfs_flags & VMMC_EVICTED)) { 3526c46a77dSDavid van Moolenbroek ASSERT(bp->lmfs_dev == dev); 3536c46a77dSDavid van Moolenbroek ASSERT(bp->lmfs_dev != NO_DEV); 3546c46a77dSDavid van Moolenbroek 3556c46a77dSDavid van Moolenbroek /* The block must have exactly the requested number of bytes. */ 3566c46a77dSDavid van Moolenbroek if (bp->lmfs_bytes != block_size) 3576c46a77dSDavid van Moolenbroek return EIO; 3586c46a77dSDavid van Moolenbroek 359433d6423SLionel Sambuc /* Block needed has been found. */ 360433d6423SLionel Sambuc if (bp->lmfs_count == 0) { 361433d6423SLionel Sambuc rm_lru(bp); 362433d6423SLionel Sambuc ASSERT(bp->lmfs_needsetcache == 0); 363433d6423SLionel Sambuc ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED)); 364e94f856bSDavid van Moolenbroek /* FIXME: race condition against the VMMC_EVICTED check */ 365433d6423SLionel Sambuc bp->lmfs_flags |= VMMC_BLOCK_LOCKED; 366433d6423SLionel Sambuc } 367433d6423SLionel Sambuc raisecount(bp); 368433d6423SLionel Sambuc ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED); 369433d6423SLionel Sambuc ASSERT(bp->data); 370433d6423SLionel Sambuc 371433d6423SLionel Sambuc if(ino != VMC_NO_INODE) { 372433d6423SLionel Sambuc if(bp->lmfs_inode == VMC_NO_INODE 373433d6423SLionel Sambuc || bp->lmfs_inode != ino 374433d6423SLionel Sambuc || bp->lmfs_inode_offset != ino_off) { 375433d6423SLionel Sambuc bp->lmfs_inode = ino; 376433d6423SLionel Sambuc bp->lmfs_inode_offset = ino_off; 377433d6423SLionel Sambuc bp->lmfs_needsetcache = 1; 378433d6423SLionel Sambuc } 379433d6423SLionel Sambuc } 380433d6423SLionel Sambuc 3816c46a77dSDavid van Moolenbroek *bpp = bp; 3826c46a77dSDavid van Moolenbroek return OK; 383433d6423SLionel Sambuc } 384e94f856bSDavid van Moolenbroek 385e94f856bSDavid van Moolenbroek /* We had the block in the cache but VM evicted it; invalidate it. */ 386e94f856bSDavid van Moolenbroek if (bp != NULL) { 387e94f856bSDavid van Moolenbroek assert(bp->lmfs_flags & VMMC_EVICTED); 388e94f856bSDavid van Moolenbroek ASSERT(bp->lmfs_count == 0); 389e94f856bSDavid van Moolenbroek ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED)); 390e94f856bSDavid van Moolenbroek ASSERT(!(bp->lmfs_flags & VMMC_DIRTY)); 391e94f856bSDavid van Moolenbroek bp->lmfs_dev = NO_DEV; 392e94f856bSDavid van Moolenbroek bp->lmfs_bytes = 0; 393e94f856bSDavid van Moolenbroek bp->data = NULL; 394433d6423SLionel Sambuc } 395433d6423SLionel Sambuc 396433d6423SLionel Sambuc /* Desired block is not on available chain. Find a free block to use. */ 397433d6423SLionel Sambuc if(bp) { 398433d6423SLionel Sambuc ASSERT(bp->lmfs_flags & VMMC_EVICTED); 399433d6423SLionel Sambuc } else { 400433d6423SLionel Sambuc if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs); 401433d6423SLionel Sambuc } 402433d6423SLionel Sambuc assert(bp); 403433d6423SLionel Sambuc 404433d6423SLionel Sambuc rm_lru(bp); 405433d6423SLionel Sambuc 406433d6423SLionel Sambuc /* Remove the block that was just taken from its hash chain. */ 407433d6423SLionel Sambuc b = BUFHASH(bp->lmfs_blocknr); 408433d6423SLionel Sambuc prev_ptr = buf_hash[b]; 409433d6423SLionel Sambuc if (prev_ptr == bp) { 410433d6423SLionel Sambuc buf_hash[b] = bp->lmfs_hash; 411433d6423SLionel Sambuc } else { 412433d6423SLionel Sambuc /* The block just taken is not on the front of its hash chain. */ 413433d6423SLionel Sambuc while (prev_ptr->lmfs_hash != NULL) 414433d6423SLionel Sambuc if (prev_ptr->lmfs_hash == bp) { 415433d6423SLionel Sambuc prev_ptr->lmfs_hash = bp->lmfs_hash; /* found it */ 416433d6423SLionel Sambuc break; 417433d6423SLionel Sambuc } else { 418433d6423SLionel Sambuc prev_ptr = prev_ptr->lmfs_hash; /* keep looking */ 419433d6423SLionel Sambuc } 420433d6423SLionel Sambuc } 421433d6423SLionel Sambuc 422433d6423SLionel Sambuc freeblock(bp); 423433d6423SLionel Sambuc 424433d6423SLionel Sambuc bp->lmfs_inode = ino; 425433d6423SLionel Sambuc bp->lmfs_inode_offset = ino_off; 426433d6423SLionel Sambuc 427433d6423SLionel Sambuc bp->lmfs_flags = VMMC_BLOCK_LOCKED; 428433d6423SLionel Sambuc bp->lmfs_needsetcache = 0; 429433d6423SLionel Sambuc bp->lmfs_dev = dev; /* fill in device number */ 430433d6423SLionel Sambuc bp->lmfs_blocknr = block; /* fill in block number */ 431433d6423SLionel Sambuc ASSERT(bp->lmfs_count == 0); 432433d6423SLionel Sambuc raisecount(bp); 433433d6423SLionel Sambuc b = BUFHASH(bp->lmfs_blocknr); 434433d6423SLionel Sambuc bp->lmfs_hash = buf_hash[b]; 435433d6423SLionel Sambuc 436433d6423SLionel Sambuc buf_hash[b] = bp; /* add to hash list */ 437433d6423SLionel Sambuc 438433d6423SLionel Sambuc assert(dev != NO_DEV); 439433d6423SLionel Sambuc 4404472b590SDavid van Moolenbroek /* The block is not found in our cache, but we do want it if it's in the VM 4414472b590SDavid van Moolenbroek * cache. The exception is NO_READ, purely for context switching performance 4424472b590SDavid van Moolenbroek * reasons. NO_READ is used for 1) newly allocated blocks, 2) blocks being 4434472b590SDavid van Moolenbroek * prefetched, and 3) blocks about to be fully overwritten. In the first two 4444472b590SDavid van Moolenbroek * cases, VM will not have the block in its cache anyway, and for the third 4454472b590SDavid van Moolenbroek * we save on one VM call only if the block is in the VM cache. 446433d6423SLionel Sambuc */ 447433d6423SLionel Sambuc assert(!bp->data); 448433d6423SLionel Sambuc assert(!bp->lmfs_bytes); 4494472b590SDavid van Moolenbroek if (how != NO_READ && vmcache) { 450433d6423SLionel Sambuc if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off, 4516c46a77dSDavid van Moolenbroek &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) { 4526c46a77dSDavid van Moolenbroek bp->lmfs_bytes = block_size; 453433d6423SLionel Sambuc ASSERT(!bp->lmfs_needsetcache); 4546c46a77dSDavid van Moolenbroek *bpp = bp; 4556c46a77dSDavid van Moolenbroek return OK; 456433d6423SLionel Sambuc } 457433d6423SLionel Sambuc } 458433d6423SLionel Sambuc bp->data = NULL; 459433d6423SLionel Sambuc 460cb9453caSDavid van Moolenbroek /* The block is not in the cache, and VM does not know about it. If we were 461cb9453caSDavid van Moolenbroek * requested to search for the block only, we can now return failure to the 462cb9453caSDavid van Moolenbroek * caller. Return the block to the pool without allocating data pages, since 463cb9453caSDavid van Moolenbroek * these would be freed upon recycling the block anyway. 464cb9453caSDavid van Moolenbroek */ 465cb9453caSDavid van Moolenbroek if (how == PEEK) { 466cb9453caSDavid van Moolenbroek bp->lmfs_dev = NO_DEV; 467cb9453caSDavid van Moolenbroek 4680314acfbSDavid van Moolenbroek put_block(bp, ONE_SHOT); 469cb9453caSDavid van Moolenbroek 4706c46a77dSDavid van Moolenbroek return ENOENT; 471cb9453caSDavid van Moolenbroek } 472cb9453caSDavid van Moolenbroek 473433d6423SLionel Sambuc /* Not in the cache; reserve memory for its contents. */ 474433d6423SLionel Sambuc 4756c46a77dSDavid van Moolenbroek lmfs_alloc_block(bp, block_size); 476433d6423SLionel Sambuc 477433d6423SLionel Sambuc assert(bp->data); 478433d6423SLionel Sambuc 4794472b590SDavid van Moolenbroek if (how == NORMAL) { 4806c46a77dSDavid van Moolenbroek /* Try to read the block. Return an error code on failure. */ 4816c46a77dSDavid van Moolenbroek if ((r = read_block(bp, block_size)) != OK) { 4826c46a77dSDavid van Moolenbroek put_block(bp, 0); 4836c46a77dSDavid van Moolenbroek 4846c46a77dSDavid van Moolenbroek return r; 4856c46a77dSDavid van Moolenbroek } 486cb9453caSDavid van Moolenbroek } else if(how == NO_READ) { 487433d6423SLionel Sambuc /* This block will be overwritten by new contents. */ 488433d6423SLionel Sambuc } else 489cb9453caSDavid van Moolenbroek panic("unexpected 'how' value: %d", how); 490433d6423SLionel Sambuc 491433d6423SLionel Sambuc assert(bp->data); 492433d6423SLionel Sambuc 4936c46a77dSDavid van Moolenbroek *bpp = bp; /* return the newly acquired block */ 4946c46a77dSDavid van Moolenbroek return OK; 4956c46a77dSDavid van Moolenbroek } 4966c46a77dSDavid van Moolenbroek 4976c46a77dSDavid van Moolenbroek /*===========================================================================* 4986c46a77dSDavid van Moolenbroek * lmfs_get_block_ino * 4996c46a77dSDavid van Moolenbroek *===========================================================================*/ 5006c46a77dSDavid van Moolenbroek int lmfs_get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how, 5016c46a77dSDavid van Moolenbroek ino_t ino, u64_t ino_off) 5026c46a77dSDavid van Moolenbroek { 5036c46a77dSDavid van Moolenbroek return get_block_ino(bpp, dev, block, how, ino, ino_off, fs_block_size); 5046c46a77dSDavid van Moolenbroek } 5056c46a77dSDavid van Moolenbroek 5066c46a77dSDavid van Moolenbroek /*===========================================================================* 5076c46a77dSDavid van Moolenbroek * lmfs_get_partial_block * 5086c46a77dSDavid van Moolenbroek *===========================================================================*/ 5096c46a77dSDavid van Moolenbroek int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block, 5106c46a77dSDavid van Moolenbroek int how, size_t block_size) 5116c46a77dSDavid van Moolenbroek { 5126c46a77dSDavid van Moolenbroek return get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0, block_size); 513433d6423SLionel Sambuc } 514433d6423SLionel Sambuc 515433d6423SLionel Sambuc /*===========================================================================* 5160314acfbSDavid van Moolenbroek * put_block * 517433d6423SLionel Sambuc *===========================================================================*/ 5180314acfbSDavid van Moolenbroek static void put_block(struct buf *bp, int put_flags) 519433d6423SLionel Sambuc { 5200314acfbSDavid van Moolenbroek /* Return a block to the list of available blocks. Depending on 'put_flags' 521433d6423SLionel Sambuc * it may be put on the front or rear of the LRU chain. Blocks that are 5220314acfbSDavid van Moolenbroek * expected to be needed again at some point go on the rear; blocks that are 5230314acfbSDavid van Moolenbroek * unlikely to be needed again at all go on the front. 524433d6423SLionel Sambuc */ 525433d6423SLionel Sambuc dev_t dev; 526b65ad59eSDavid van Moolenbroek uint64_t dev_off; 527d75faf18SDavid van Moolenbroek int r, setflags; 528433d6423SLionel Sambuc 5290314acfbSDavid van Moolenbroek assert(bp != NULL); 530433d6423SLionel Sambuc 531433d6423SLionel Sambuc dev = bp->lmfs_dev; 532433d6423SLionel Sambuc 533b65ad59eSDavid van Moolenbroek dev_off = bp->lmfs_blocknr * fs_block_size; 534433d6423SLionel Sambuc 535433d6423SLionel Sambuc lowercount(bp); 536433d6423SLionel Sambuc if (bp->lmfs_count != 0) return; /* block is still in use */ 537433d6423SLionel Sambuc 538433d6423SLionel Sambuc /* Put this block back on the LRU chain. */ 5390314acfbSDavid van Moolenbroek if (dev == NO_DEV || dev == DEV_RAM || (put_flags & ONE_SHOT)) { 5400314acfbSDavid van Moolenbroek /* Block will not be needed again. Put it on front of chain. 541433d6423SLionel Sambuc * It will be the next block to be evicted from the cache. 542433d6423SLionel Sambuc */ 543433d6423SLionel Sambuc bp->lmfs_prev = NULL; 544433d6423SLionel Sambuc bp->lmfs_next = front; 545433d6423SLionel Sambuc if (front == NULL) 546433d6423SLionel Sambuc rear = bp; /* LRU chain was empty */ 547433d6423SLionel Sambuc else 548433d6423SLionel Sambuc front->lmfs_prev = bp; 549433d6423SLionel Sambuc front = bp; 550433d6423SLionel Sambuc } 551433d6423SLionel Sambuc else { 5520314acfbSDavid van Moolenbroek /* Block may be needed again. Put it on rear of chain. 553433d6423SLionel Sambuc * It will not be evicted from the cache for a long time. 554433d6423SLionel Sambuc */ 555433d6423SLionel Sambuc bp->lmfs_prev = rear; 556433d6423SLionel Sambuc bp->lmfs_next = NULL; 557433d6423SLionel Sambuc if (rear == NULL) 558433d6423SLionel Sambuc front = bp; 559433d6423SLionel Sambuc else 560433d6423SLionel Sambuc rear->lmfs_next = bp; 561433d6423SLionel Sambuc rear = bp; 562433d6423SLionel Sambuc } 563433d6423SLionel Sambuc 564433d6423SLionel Sambuc assert(bp->lmfs_flags & VMMC_BLOCK_LOCKED); 565433d6423SLionel Sambuc bp->lmfs_flags &= ~VMMC_BLOCK_LOCKED; 566433d6423SLionel Sambuc 567cb9453caSDavid van Moolenbroek /* block has sensible content - if necessary, identify it to VM */ 568433d6423SLionel Sambuc if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) { 569cb9453caSDavid van Moolenbroek assert(bp->data); 570cb9453caSDavid van Moolenbroek 5710314acfbSDavid van Moolenbroek setflags = (put_flags & ONE_SHOT) ? VMSF_ONCE : 0; 5726c46a77dSDavid van Moolenbroek 573d75faf18SDavid van Moolenbroek if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode, 5746c46a77dSDavid van Moolenbroek bp->lmfs_inode_offset, &bp->lmfs_flags, 5756c46a77dSDavid van Moolenbroek roundup(bp->lmfs_bytes, PAGE_SIZE), setflags)) != OK) { 576433d6423SLionel Sambuc if(r == ENOSYS) { 577433d6423SLionel Sambuc printf("libminixfs: ENOSYS, disabling VM calls\n"); 578433d6423SLionel Sambuc vmcache = 0; 5796c46a77dSDavid van Moolenbroek } else if (r == ENOMEM) { 5806c46a77dSDavid van Moolenbroek /* Do not panic in this case. Running out of memory is 5816c46a77dSDavid van Moolenbroek * bad, especially since it may lead to applications 5826c46a77dSDavid van Moolenbroek * crashing when trying to access memory-mapped pages 5836c46a77dSDavid van Moolenbroek * we haven't been able to pass off to the VM cache, 5846c46a77dSDavid van Moolenbroek * but the entire file system crashing is always worse. 5856c46a77dSDavid van Moolenbroek */ 5866c46a77dSDavid van Moolenbroek printf("libminixfs: no memory for cache block!\n"); 587433d6423SLionel Sambuc } else { 588433d6423SLionel Sambuc panic("libminixfs: setblock of %p dev 0x%llx off " 589433d6423SLionel Sambuc "0x%llx failed\n", bp->data, dev, dev_off); 590433d6423SLionel Sambuc } 591433d6423SLionel Sambuc } 592433d6423SLionel Sambuc } 593433d6423SLionel Sambuc bp->lmfs_needsetcache = 0; 594d75faf18SDavid van Moolenbroek 595d75faf18SDavid van Moolenbroek /* Now that we (may) have given the block to VM, invalidate the block if it 596d75faf18SDavid van Moolenbroek * is a one-shot block. Otherwise, it may still be reobtained immediately 597d75faf18SDavid van Moolenbroek * after, which could be a problem if VM already forgot the block and we are 598d75faf18SDavid van Moolenbroek * expected to pass it to VM again, which then wouldn't happen. 599d75faf18SDavid van Moolenbroek */ 6000314acfbSDavid van Moolenbroek if (put_flags & ONE_SHOT) 601d75faf18SDavid van Moolenbroek bp->lmfs_dev = NO_DEV; 602e94f856bSDavid van Moolenbroek } 603433d6423SLionel Sambuc 604e94f856bSDavid van Moolenbroek /*===========================================================================* 6050314acfbSDavid van Moolenbroek * lmfs_put_block * 6060314acfbSDavid van Moolenbroek *===========================================================================*/ 6070314acfbSDavid van Moolenbroek void lmfs_put_block(struct buf *bp) 6080314acfbSDavid van Moolenbroek { 6090314acfbSDavid van Moolenbroek /* User interface to put_block(). */ 6100314acfbSDavid van Moolenbroek 6110314acfbSDavid van Moolenbroek if (bp == NULL) return; /* for poorly written file systems */ 6120314acfbSDavid van Moolenbroek 6130314acfbSDavid van Moolenbroek put_block(bp, 0); 6140314acfbSDavid van Moolenbroek } 6150314acfbSDavid van Moolenbroek 6160314acfbSDavid van Moolenbroek /*===========================================================================* 617e94f856bSDavid van Moolenbroek * lmfs_free_block * 618e94f856bSDavid van Moolenbroek *===========================================================================*/ 619e94f856bSDavid van Moolenbroek void lmfs_free_block(dev_t dev, block64_t block) 620e94f856bSDavid van Moolenbroek { 621e94f856bSDavid van Moolenbroek /* The file system has just freed the given block. The block may previously 622e94f856bSDavid van Moolenbroek * have been in use as data block for an inode. Therefore, we now need to tell 623e94f856bSDavid van Moolenbroek * VM that the block is no longer associated with an inode. If we fail to do so 624e94f856bSDavid van Moolenbroek * and the inode now has a hole at this location, mapping in the hole would 625e94f856bSDavid van Moolenbroek * yield the old block contents rather than a zeroed page. In addition, if the 626e94f856bSDavid van Moolenbroek * block is in the cache, it will be removed, even if it was dirty. 627e94f856bSDavid van Moolenbroek */ 628e94f856bSDavid van Moolenbroek struct buf *bp; 629e94f856bSDavid van Moolenbroek int r; 630e94f856bSDavid van Moolenbroek 631e94f856bSDavid van Moolenbroek /* Tell VM to forget about the block. The primary purpose of this call is to 632e94f856bSDavid van Moolenbroek * break the inode association, but since the block is part of a mounted file 633e94f856bSDavid van Moolenbroek * system, it is not expected to be accessed directly anyway. So, save some 634e94f856bSDavid van Moolenbroek * cache memory by throwing it out of the VM cache altogether. 635e94f856bSDavid van Moolenbroek */ 636e94f856bSDavid van Moolenbroek if (vmcache) { 637e94f856bSDavid van Moolenbroek if ((r = vm_forget_cacheblock(dev, block * fs_block_size, 638e94f856bSDavid van Moolenbroek fs_block_size)) != OK) 639e94f856bSDavid van Moolenbroek printf("libminixfs: vm_forget_cacheblock failed (%d)\n", r); 640e94f856bSDavid van Moolenbroek } 641e94f856bSDavid van Moolenbroek 642e94f856bSDavid van Moolenbroek if ((bp = find_block(dev, block)) != NULL) { 643e94f856bSDavid van Moolenbroek lmfs_markclean(bp); 644e94f856bSDavid van Moolenbroek 645e94f856bSDavid van Moolenbroek /* Invalidate the block. The block may or may not be in use right now, 646e94f856bSDavid van Moolenbroek * so don't be smart about freeing memory or repositioning in the LRU. 647e94f856bSDavid van Moolenbroek */ 648e94f856bSDavid van Moolenbroek bp->lmfs_dev = NO_DEV; 649e94f856bSDavid van Moolenbroek } 650e94f856bSDavid van Moolenbroek 651e94f856bSDavid van Moolenbroek /* Note that this is *not* the right place to implement TRIM support. Even 652e94f856bSDavid van Moolenbroek * though the block is freed, on the device it may still be part of a 653e94f856bSDavid van Moolenbroek * previous checkpoint or snapshot of some sort. Only the file system can 654e94f856bSDavid van Moolenbroek * be trusted to decide which blocks can be reused on the device! 655e94f856bSDavid van Moolenbroek */ 656433d6423SLionel Sambuc } 657433d6423SLionel Sambuc 658d75faf18SDavid van Moolenbroek /*===========================================================================* 659d75faf18SDavid van Moolenbroek * lmfs_zero_block_ino * 660d75faf18SDavid van Moolenbroek *===========================================================================*/ 661d75faf18SDavid van Moolenbroek void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off) 662d75faf18SDavid van Moolenbroek { 663d75faf18SDavid van Moolenbroek /* Files may have holes. From an application perspective, these are just file 664d75faf18SDavid van Moolenbroek * regions filled with zeroes. From a file system perspective however, holes 665d75faf18SDavid van Moolenbroek * may represent unallocated regions on disk. Thus, these holes do not have 666d75faf18SDavid van Moolenbroek * corresponding blocks on the disk, and therefore also no block number. 667d75faf18SDavid van Moolenbroek * Therefore, we cannot simply use lmfs_get_block_ino() for them. For reads, 668d75faf18SDavid van Moolenbroek * this is not a problem, since the file system can just zero out the target 669d75faf18SDavid van Moolenbroek * application buffer instead. For mapped pages however, this *is* a problem, 670d75faf18SDavid van Moolenbroek * since the VM cache needs to be told about the corresponding block, and VM 671d75faf18SDavid van Moolenbroek * does not accept blocks without a device offset. The role of this function is 672d75faf18SDavid van Moolenbroek * therefore to tell VM about the hole using a fake device offset. The device 673d75faf18SDavid van Moolenbroek * offsets are picked so that the VM cache will see a block memory-mapped for 674d75faf18SDavid van Moolenbroek * the hole in the file, while the same block is not visible when 675d75faf18SDavid van Moolenbroek * memory-mapping the block device. 676d75faf18SDavid van Moolenbroek */ 677d75faf18SDavid van Moolenbroek struct buf *bp; 678d75faf18SDavid van Moolenbroek static block64_t fake_block = 0; 6796c46a77dSDavid van Moolenbroek int r; 680d75faf18SDavid van Moolenbroek 681d75faf18SDavid van Moolenbroek if (!vmcache) 682d75faf18SDavid van Moolenbroek return; 683d75faf18SDavid van Moolenbroek 684d75faf18SDavid van Moolenbroek assert(fs_block_size > 0); 685d75faf18SDavid van Moolenbroek 686d75faf18SDavid van Moolenbroek /* Pick a block number which is above the threshold of what can possibly be 687d75faf18SDavid van Moolenbroek * mapped in by mmap'ing the device, since off_t is signed, and it is safe to 688d75faf18SDavid van Moolenbroek * say that it will take a while before we have 8-exabyte devices. Pick a 689d75faf18SDavid van Moolenbroek * different block number each time to avoid possible concurrency issues. 690d75faf18SDavid van Moolenbroek * FIXME: it does not seem like VM actually verifies mmap offsets though.. 691d75faf18SDavid van Moolenbroek */ 692d75faf18SDavid van Moolenbroek if (fake_block == 0 || ++fake_block >= UINT64_MAX / fs_block_size) 693d75faf18SDavid van Moolenbroek fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size; 694d75faf18SDavid van Moolenbroek 695d75faf18SDavid van Moolenbroek /* Obtain a block. */ 6966c46a77dSDavid van Moolenbroek if ((r = lmfs_get_block_ino(&bp, dev, fake_block, NO_READ, ino, 6976c46a77dSDavid van Moolenbroek ino_off)) != OK) 6986c46a77dSDavid van Moolenbroek panic("libminixfs: getting a NO_READ block failed: %d", r); 699d75faf18SDavid van Moolenbroek assert(bp != NULL); 700d75faf18SDavid van Moolenbroek assert(bp->lmfs_dev != NO_DEV); 701d75faf18SDavid van Moolenbroek 702d75faf18SDavid van Moolenbroek /* The block is already zeroed, as it has just been allocated with mmap. File 703d75faf18SDavid van Moolenbroek * systems do not rely on this assumption yet, so if VM ever gets changed to 704d75faf18SDavid van Moolenbroek * not clear the blocks we allocate (e.g., by recycling pages in the VM cache 705d75faf18SDavid van Moolenbroek * for the same process, which would be safe), we need to add a memset here. 706d75faf18SDavid van Moolenbroek */ 707d75faf18SDavid van Moolenbroek 708d75faf18SDavid van Moolenbroek /* Release the block. We don't expect it to be accessed ever again. Moreover, 709d75faf18SDavid van Moolenbroek * if we keep the block around in the VM cache, it may erroneously be mapped 710d75faf18SDavid van Moolenbroek * in beyond the file end later. Hence, use VMSF_ONCE when passing it to VM. 711d75faf18SDavid van Moolenbroek * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate 712d75faf18SDavid van Moolenbroek * all such pages in its cache. 713d75faf18SDavid van Moolenbroek */ 7140314acfbSDavid van Moolenbroek put_block(bp, ONE_SHOT); 715d75faf18SDavid van Moolenbroek } 716d75faf18SDavid van Moolenbroek 7171311233cSDavid van Moolenbroek void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused) 718433d6423SLionel Sambuc { 7191311233cSDavid van Moolenbroek 7201311233cSDavid van Moolenbroek assert(bused <= btotal); 7211311233cSDavid van Moolenbroek fs_btotal = btotal; 7221311233cSDavid van Moolenbroek fs_bused = bused; 7231311233cSDavid van Moolenbroek 7241311233cSDavid van Moolenbroek /* if the cache isn't in use, we could resize it. */ 7251311233cSDavid van Moolenbroek if (bufs_in_use == 0) 7260314acfbSDavid van Moolenbroek cache_heuristic_check(); 727433d6423SLionel Sambuc } 728433d6423SLionel Sambuc 729433d6423SLionel Sambuc /*===========================================================================* 730433d6423SLionel Sambuc * read_block * 731433d6423SLionel Sambuc *===========================================================================*/ 7326c46a77dSDavid van Moolenbroek static int read_block(struct buf *bp, size_t block_size) 733433d6423SLionel Sambuc { 7346c46a77dSDavid van Moolenbroek /* Read a disk block of 'size' bytes. The given size is always the FS block 7356c46a77dSDavid van Moolenbroek * size, except for the last block of a device. If an I/O error occurs, 7366c46a77dSDavid van Moolenbroek * invalidate the block and return an error code. 737433d6423SLionel Sambuc */ 7386c46a77dSDavid van Moolenbroek ssize_t r; 739433d6423SLionel Sambuc off_t pos; 740433d6423SLionel Sambuc dev_t dev = bp->lmfs_dev; 741433d6423SLionel Sambuc 742433d6423SLionel Sambuc assert(dev != NO_DEV); 743433d6423SLionel Sambuc 7446c46a77dSDavid van Moolenbroek ASSERT(bp->lmfs_bytes == block_size); 745433d6423SLionel Sambuc ASSERT(fs_block_size > 0); 746433d6423SLionel Sambuc 747433d6423SLionel Sambuc pos = (off_t)bp->lmfs_blocknr * fs_block_size; 7486c46a77dSDavid van Moolenbroek if (block_size > PAGE_SIZE) { 749433d6423SLionel Sambuc #define MAXPAGES 20 750433d6423SLionel Sambuc vir_bytes blockrem, vaddr = (vir_bytes) bp->data; 751433d6423SLionel Sambuc int p = 0; 752433d6423SLionel Sambuc static iovec_t iovec[MAXPAGES]; 7536c46a77dSDavid van Moolenbroek blockrem = block_size; 754433d6423SLionel Sambuc while(blockrem > 0) { 755433d6423SLionel Sambuc vir_bytes chunk = blockrem >= PAGE_SIZE ? PAGE_SIZE : blockrem; 756433d6423SLionel Sambuc iovec[p].iov_addr = vaddr; 757433d6423SLionel Sambuc iovec[p].iov_size = chunk; 758433d6423SLionel Sambuc vaddr += chunk; 759433d6423SLionel Sambuc blockrem -= chunk; 760433d6423SLionel Sambuc p++; 761433d6423SLionel Sambuc } 762433d6423SLionel Sambuc r = bdev_gather(dev, pos, iovec, p, BDEV_NOFLAGS); 763433d6423SLionel Sambuc } else { 7646c46a77dSDavid van Moolenbroek r = bdev_read(dev, pos, bp->data, block_size, BDEV_NOFLAGS); 765433d6423SLionel Sambuc } 7666c46a77dSDavid van Moolenbroek if (r != (ssize_t)block_size) { 7676c46a77dSDavid van Moolenbroek printf("fs cache: I/O error on device %d/%d, block %"PRIu64" (%zd)\n", 7686c46a77dSDavid van Moolenbroek major(dev), minor(dev), bp->lmfs_blocknr, r); 7696c46a77dSDavid van Moolenbroek if (r >= 0) 7706c46a77dSDavid van Moolenbroek r = EIO; /* TODO: retry retrieving (just) the remaining part */ 771433d6423SLionel Sambuc 772433d6423SLionel Sambuc bp->lmfs_dev = NO_DEV; /* invalidate block */ 773433d6423SLionel Sambuc 7746c46a77dSDavid van Moolenbroek return r; 775433d6423SLionel Sambuc } 776433d6423SLionel Sambuc 7776c46a77dSDavid van Moolenbroek return OK; 778433d6423SLionel Sambuc } 779433d6423SLionel Sambuc 780433d6423SLionel Sambuc /*===========================================================================* 781433d6423SLionel Sambuc * lmfs_invalidate * 782433d6423SLionel Sambuc *===========================================================================*/ 783433d6423SLionel Sambuc void lmfs_invalidate( 784433d6423SLionel Sambuc dev_t device /* device whose blocks are to be purged */ 785433d6423SLionel Sambuc ) 786433d6423SLionel Sambuc { 787433d6423SLionel Sambuc /* Remove all the blocks belonging to some device from the cache. */ 788433d6423SLionel Sambuc 789433d6423SLionel Sambuc register struct buf *bp; 790433d6423SLionel Sambuc 791cb9453caSDavid van Moolenbroek assert(device != NO_DEV); 792cb9453caSDavid van Moolenbroek 793433d6423SLionel Sambuc for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) { 794433d6423SLionel Sambuc if (bp->lmfs_dev == device) { 795433d6423SLionel Sambuc assert(bp->data); 796433d6423SLionel Sambuc assert(bp->lmfs_bytes > 0); 797433d6423SLionel Sambuc munmap_t(bp->data, bp->lmfs_bytes); 798433d6423SLionel Sambuc bp->lmfs_dev = NO_DEV; 799433d6423SLionel Sambuc bp->lmfs_bytes = 0; 800433d6423SLionel Sambuc bp->data = NULL; 801433d6423SLionel Sambuc } 802433d6423SLionel Sambuc } 803433d6423SLionel Sambuc 804e94f856bSDavid van Moolenbroek /* Clear the cache even if VM caching is disabled for the file system: 805e94f856bSDavid van Moolenbroek * caching may be disabled as side effect of an error, leaving blocks behind 806e94f856bSDavid van Moolenbroek * in the actual VM cache. 807e94f856bSDavid van Moolenbroek */ 808433d6423SLionel Sambuc vm_clear_cache(device); 809433d6423SLionel Sambuc } 810433d6423SLionel Sambuc 811433d6423SLionel Sambuc /*===========================================================================* 8124472b590SDavid van Moolenbroek * sort_blocks * 813433d6423SLionel Sambuc *===========================================================================*/ 8144472b590SDavid van Moolenbroek static void sort_blocks(struct buf **bufq, unsigned int bufqsize) 815433d6423SLionel Sambuc { 8164472b590SDavid van Moolenbroek struct buf *bp; 8174472b590SDavid van Moolenbroek int i, j, gap; 818433d6423SLionel Sambuc 8194472b590SDavid van Moolenbroek gap = 1; 8204472b590SDavid van Moolenbroek do 8214472b590SDavid van Moolenbroek gap = 3 * gap + 1; 8224472b590SDavid van Moolenbroek while ((unsigned int)gap <= bufqsize); 823433d6423SLionel Sambuc 8244472b590SDavid van Moolenbroek while (gap != 1) { 8254472b590SDavid van Moolenbroek gap /= 3; 8264472b590SDavid van Moolenbroek for (j = gap; (unsigned int)j < bufqsize; j++) { 8274472b590SDavid van Moolenbroek for (i = j - gap; i >= 0 && 8284472b590SDavid van Moolenbroek bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr; 8294472b590SDavid van Moolenbroek i -= gap) { 8304472b590SDavid van Moolenbroek bp = bufq[i]; 8314472b590SDavid van Moolenbroek bufq[i] = bufq[i + gap]; 8324472b590SDavid van Moolenbroek bufq[i + gap] = bp; 833433d6423SLionel Sambuc } 834433d6423SLionel Sambuc } 8354472b590SDavid van Moolenbroek } 836433d6423SLionel Sambuc } 837433d6423SLionel Sambuc 838433d6423SLionel Sambuc /*===========================================================================* 8394472b590SDavid van Moolenbroek * rw_scattered * 840433d6423SLionel Sambuc *===========================================================================*/ 8414472b590SDavid van Moolenbroek static void rw_scattered( 842433d6423SLionel Sambuc dev_t dev, /* major-minor device number */ 843433d6423SLionel Sambuc struct buf **bufq, /* pointer to array of buffers */ 8444472b590SDavid van Moolenbroek unsigned int bufqsize, /* number of buffers */ 845433d6423SLionel Sambuc int rw_flag /* READING or WRITING */ 846433d6423SLionel Sambuc ) 847433d6423SLionel Sambuc { 848433d6423SLionel Sambuc /* Read or write scattered data from a device. */ 849433d6423SLionel Sambuc 850433d6423SLionel Sambuc register struct buf *bp; 851433d6423SLionel Sambuc register iovec_t *iop; 852433d6423SLionel Sambuc static iovec_t iovec[NR_IOREQS]; 853433d6423SLionel Sambuc off_t pos; 8544472b590SDavid van Moolenbroek unsigned int i, iov_per_block; 85565f76edbSDavid van Moolenbroek unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize; 856433d6423SLionel Sambuc 857433d6423SLionel Sambuc if(bufqsize == 0) return; 858433d6423SLionel Sambuc 859433d6423SLionel Sambuc /* for READING, check all buffers on the list are obtained and held 860433d6423SLionel Sambuc * (count > 0) 861433d6423SLionel Sambuc */ 862433d6423SLionel Sambuc if (rw_flag == READING) { 8634472b590SDavid van Moolenbroek assert(bufqsize <= LMFS_MAX_PREFETCH); 8644472b590SDavid van Moolenbroek 865433d6423SLionel Sambuc for(i = 0; i < bufqsize; i++) { 866433d6423SLionel Sambuc assert(bufq[i] != NULL); 867433d6423SLionel Sambuc assert(bufq[i]->lmfs_count > 0); 868433d6423SLionel Sambuc } 869433d6423SLionel Sambuc 870433d6423SLionel Sambuc /* therefore they are all 'in use' and must be at least this many */ 871433d6423SLionel Sambuc assert(start_in_use >= start_bufqsize); 872433d6423SLionel Sambuc } 873433d6423SLionel Sambuc 874433d6423SLionel Sambuc assert(dev != NO_DEV); 875433d6423SLionel Sambuc assert(fs_block_size > 0); 8766c46a77dSDavid van Moolenbroek assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS); 877433d6423SLionel Sambuc 8784472b590SDavid van Moolenbroek /* For WRITING, (Shell) sort buffers on lmfs_blocknr. 8794472b590SDavid van Moolenbroek * For READING, the buffers are already sorted. 8804472b590SDavid van Moolenbroek */ 8814472b590SDavid van Moolenbroek if (rw_flag == WRITING) 8824472b590SDavid van Moolenbroek sort_blocks(bufq, bufqsize); 883433d6423SLionel Sambuc 884433d6423SLionel Sambuc /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything 885433d6423SLionel Sambuc * went fine, otherwise the error code for the first failed transfer. 886433d6423SLionel Sambuc */ 887433d6423SLionel Sambuc while (bufqsize > 0) { 8884472b590SDavid van Moolenbroek unsigned int p, nblocks = 0, niovecs = 0; 889433d6423SLionel Sambuc int r; 890433d6423SLionel Sambuc for (iop = iovec; nblocks < bufqsize; nblocks++) { 891433d6423SLionel Sambuc vir_bytes vdata, blockrem; 892433d6423SLionel Sambuc bp = bufq[nblocks]; 893b65ad59eSDavid van Moolenbroek if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks) 894433d6423SLionel Sambuc break; 8956c46a77dSDavid van Moolenbroek blockrem = bp->lmfs_bytes; 8966c46a77dSDavid van Moolenbroek iov_per_block = howmany(blockrem, PAGE_SIZE); 8974472b590SDavid van Moolenbroek if (niovecs > NR_IOREQS - iov_per_block) break; 898433d6423SLionel Sambuc vdata = (vir_bytes) bp->data; 899433d6423SLionel Sambuc for(p = 0; p < iov_per_block; p++) { 9006c46a77dSDavid van Moolenbroek vir_bytes chunk = 9016c46a77dSDavid van Moolenbroek blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE; 902433d6423SLionel Sambuc iop->iov_addr = vdata; 903433d6423SLionel Sambuc iop->iov_size = chunk; 904433d6423SLionel Sambuc vdata += PAGE_SIZE; 905433d6423SLionel Sambuc blockrem -= chunk; 906433d6423SLionel Sambuc iop++; 907433d6423SLionel Sambuc niovecs++; 908433d6423SLionel Sambuc } 909433d6423SLionel Sambuc assert(p == iov_per_block); 910433d6423SLionel Sambuc assert(blockrem == 0); 911433d6423SLionel Sambuc } 912433d6423SLionel Sambuc 913433d6423SLionel Sambuc assert(nblocks > 0); 9144472b590SDavid van Moolenbroek assert(niovecs > 0 && niovecs <= NR_IOREQS); 915433d6423SLionel Sambuc 916433d6423SLionel Sambuc pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size; 917433d6423SLionel Sambuc if (rw_flag == READING) 918433d6423SLionel Sambuc r = bdev_gather(dev, pos, iovec, niovecs, BDEV_NOFLAGS); 919433d6423SLionel Sambuc else 920433d6423SLionel Sambuc r = bdev_scatter(dev, pos, iovec, niovecs, BDEV_NOFLAGS); 921433d6423SLionel Sambuc 922433d6423SLionel Sambuc /* Harvest the results. The driver may have returned an error, or it 923433d6423SLionel Sambuc * may have done less than what we asked for. 924433d6423SLionel Sambuc */ 925433d6423SLionel Sambuc if (r < 0) { 926b65ad59eSDavid van Moolenbroek printf("fs cache: I/O error %d on device %d/%d, " 927b65ad59eSDavid van Moolenbroek "block %"PRIu64"\n", 928433d6423SLionel Sambuc r, major(dev), minor(dev), bufq[0]->lmfs_blocknr); 929433d6423SLionel Sambuc } 930433d6423SLionel Sambuc for (i = 0; i < nblocks; i++) { 931433d6423SLionel Sambuc bp = bufq[i]; 9326c46a77dSDavid van Moolenbroek if (r < (ssize_t)bp->lmfs_bytes) { 933433d6423SLionel Sambuc /* Transfer failed. */ 934433d6423SLionel Sambuc if (i == 0) { 935433d6423SLionel Sambuc bp->lmfs_dev = NO_DEV; /* Invalidate block */ 936433d6423SLionel Sambuc } 937433d6423SLionel Sambuc break; 938433d6423SLionel Sambuc } 939433d6423SLionel Sambuc if (rw_flag == READING) { 9400314acfbSDavid van Moolenbroek lmfs_put_block(bp); 941433d6423SLionel Sambuc } else { 942433d6423SLionel Sambuc MARKCLEAN(bp); 943433d6423SLionel Sambuc } 9446c46a77dSDavid van Moolenbroek r -= bp->lmfs_bytes; 945433d6423SLionel Sambuc } 946433d6423SLionel Sambuc 947433d6423SLionel Sambuc bufq += i; 948433d6423SLionel Sambuc bufqsize -= i; 949433d6423SLionel Sambuc 950433d6423SLionel Sambuc if (rw_flag == READING) { 951433d6423SLionel Sambuc /* Don't bother reading more than the device is willing to 952433d6423SLionel Sambuc * give at this time. Don't forget to release those extras. 953433d6423SLionel Sambuc */ 954433d6423SLionel Sambuc while (bufqsize > 0) { 9554472b590SDavid van Moolenbroek bp = *bufq++; 9564472b590SDavid van Moolenbroek bp->lmfs_dev = NO_DEV; /* invalidate block */ 9574472b590SDavid van Moolenbroek lmfs_put_block(bp); 958433d6423SLionel Sambuc bufqsize--; 959433d6423SLionel Sambuc } 960433d6423SLionel Sambuc } 961433d6423SLionel Sambuc if (rw_flag == WRITING && i == 0) { 962433d6423SLionel Sambuc /* We're not making progress, this means we might keep 963433d6423SLionel Sambuc * looping. Buffers remain dirty if un-written. Buffers are 964433d6423SLionel Sambuc * lost if invalidate()d or LRU-removed while dirty. This 965433d6423SLionel Sambuc * is better than keeping unwritable blocks around forever.. 966433d6423SLionel Sambuc */ 967433d6423SLionel Sambuc break; 968433d6423SLionel Sambuc } 969433d6423SLionel Sambuc } 970433d6423SLionel Sambuc 971433d6423SLionel Sambuc if(rw_flag == READING) { 972433d6423SLionel Sambuc assert(start_in_use >= start_bufqsize); 973433d6423SLionel Sambuc 974433d6423SLionel Sambuc /* READING callers assume all bufs are released. */ 975433d6423SLionel Sambuc assert(start_in_use - start_bufqsize == bufs_in_use); 976433d6423SLionel Sambuc } 977433d6423SLionel Sambuc } 978433d6423SLionel Sambuc 979433d6423SLionel Sambuc /*===========================================================================* 9804472b590SDavid van Moolenbroek * lmfs_readahead * 9814472b590SDavid van Moolenbroek *===========================================================================*/ 9824472b590SDavid van Moolenbroek void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks, 9834472b590SDavid van Moolenbroek size_t last_size) 9844472b590SDavid van Moolenbroek { 9854472b590SDavid van Moolenbroek /* Read ahead 'nblocks' blocks starting from the block 'base_block' on device 9864472b590SDavid van Moolenbroek * 'dev'. The number of blocks must be between 1 and LMFS_MAX_PREFETCH, 9874472b590SDavid van Moolenbroek * inclusive. All blocks have the file system's block size, possibly except the 9884472b590SDavid van Moolenbroek * last block in the range, which is of size 'last_size'. The caller must 9894472b590SDavid van Moolenbroek * ensure that none of the blocks in the range are already in the cache. 9904472b590SDavid van Moolenbroek * However, the caller must also not rely on all or even any of the blocks to 9914472b590SDavid van Moolenbroek * be present in the cache afterwards--failures are (deliberately!) ignored. 9924472b590SDavid van Moolenbroek */ 993*129adfebSDavid van Moolenbroek static noxfer_buf_ptr_t bufq[LMFS_MAX_PREFETCH]; /* static for size only */ 9944472b590SDavid van Moolenbroek struct buf *bp; 9954472b590SDavid van Moolenbroek unsigned int count; 9964472b590SDavid van Moolenbroek int r; 9974472b590SDavid van Moolenbroek 9984472b590SDavid van Moolenbroek assert(nblocks >= 1 && nblocks <= LMFS_MAX_PREFETCH); 9994472b590SDavid van Moolenbroek 10004472b590SDavid van Moolenbroek for (count = 0; count < nblocks; count++) { 10014472b590SDavid van Moolenbroek if (count == nblocks - 1) 10024472b590SDavid van Moolenbroek r = lmfs_get_partial_block(&bp, dev, base_block + count, 10034472b590SDavid van Moolenbroek NO_READ, last_size); 10044472b590SDavid van Moolenbroek else 10054472b590SDavid van Moolenbroek r = lmfs_get_block(&bp, dev, base_block + count, NO_READ); 10064472b590SDavid van Moolenbroek 10074472b590SDavid van Moolenbroek if (r != OK) 10084472b590SDavid van Moolenbroek break; 10094472b590SDavid van Moolenbroek 10104472b590SDavid van Moolenbroek /* We could add a flag that makes the get_block() calls fail if the 10114472b590SDavid van Moolenbroek * block is already in the cache, but it is not a major concern if it 10124472b590SDavid van Moolenbroek * is: we just perform a useless read in that case. However, if the 10134472b590SDavid van Moolenbroek * block is cached *and* dirty, we are about to lose its new contents. 10144472b590SDavid van Moolenbroek */ 10154472b590SDavid van Moolenbroek assert(lmfs_isclean(bp)); 10164472b590SDavid van Moolenbroek 10174472b590SDavid van Moolenbroek bufq[count] = bp; 10184472b590SDavid van Moolenbroek } 10194472b590SDavid van Moolenbroek 10204472b590SDavid van Moolenbroek rw_scattered(dev, bufq, count, READING); 10214472b590SDavid van Moolenbroek } 10224472b590SDavid van Moolenbroek 10234472b590SDavid van Moolenbroek /*===========================================================================* 10244472b590SDavid van Moolenbroek * lmfs_prefetch * 10254472b590SDavid van Moolenbroek *===========================================================================*/ 10264472b590SDavid van Moolenbroek unsigned int lmfs_readahead_limit(void) 10274472b590SDavid van Moolenbroek { 10284472b590SDavid van Moolenbroek /* Return the maximum number of blocks that should be read ahead at once. The 10294472b590SDavid van Moolenbroek * return value is guaranteed to be between 1 and LMFS_MAX_PREFETCH, inclusive. 10304472b590SDavid van Moolenbroek */ 10314472b590SDavid van Moolenbroek unsigned int max_transfer, max_bufs; 10324472b590SDavid van Moolenbroek 10334472b590SDavid van Moolenbroek /* The returned value is the minimum of two factors: the maximum number of 10344472b590SDavid van Moolenbroek * blocks that can be transferred in a single I/O gather request (see how 10354472b590SDavid van Moolenbroek * rw_scattered() generates I/O requests), and a policy limit on the number 10364472b590SDavid van Moolenbroek * of buffers that any read-ahead operation may use (that is, thrash). 10374472b590SDavid van Moolenbroek */ 10384472b590SDavid van Moolenbroek max_transfer = NR_IOREQS / MAX(fs_block_size / PAGE_SIZE, 1); 10394472b590SDavid van Moolenbroek 10404472b590SDavid van Moolenbroek /* The constants have been imported from MFS as is, and may need tuning. */ 10414472b590SDavid van Moolenbroek if (nr_bufs < 50) 10424472b590SDavid van Moolenbroek max_bufs = 18; 10434472b590SDavid van Moolenbroek else 10444472b590SDavid van Moolenbroek max_bufs = nr_bufs - 4; 10454472b590SDavid van Moolenbroek 10464472b590SDavid van Moolenbroek return MIN(max_transfer, max_bufs); 10474472b590SDavid van Moolenbroek } 10484472b590SDavid van Moolenbroek 10494472b590SDavid van Moolenbroek /*===========================================================================* 10504472b590SDavid van Moolenbroek * lmfs_prefetch * 10514472b590SDavid van Moolenbroek *===========================================================================*/ 10524472b590SDavid van Moolenbroek void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks) 10534472b590SDavid van Moolenbroek { 10544472b590SDavid van Moolenbroek /* The given set of blocks is expected to be needed soon, so prefetch a 10554472b590SDavid van Moolenbroek * convenient subset. The blocks are expected to be sorted by likelihood of 10564472b590SDavid van Moolenbroek * being accessed soon, making the first block of the set the most important 10574472b590SDavid van Moolenbroek * block to prefetch right now. The caller must have made sure that the blocks 10584472b590SDavid van Moolenbroek * are not in the cache already. The array may have duplicate block numbers. 10594472b590SDavid van Moolenbroek */ 10604472b590SDavid van Moolenbroek bitchunk_t blocks_before[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)]; 10614472b590SDavid van Moolenbroek bitchunk_t blocks_after[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)]; 10624472b590SDavid van Moolenbroek block64_t block, base_block; 10634472b590SDavid van Moolenbroek unsigned int i, bit, nr_before, nr_after, span, limit, nr_blocks; 10644472b590SDavid van Moolenbroek 10654472b590SDavid van Moolenbroek if (nblocks == 0) 10664472b590SDavid van Moolenbroek return; 10674472b590SDavid van Moolenbroek 10684472b590SDavid van Moolenbroek /* Here is the deal. We are going to prefetch one range only, because seeking 10694472b590SDavid van Moolenbroek * is too expensive for just prefetching. The range we select should at least 10704472b590SDavid van Moolenbroek * include the first ("base") block of the given set, since that is the block 10714472b590SDavid van Moolenbroek * the caller is primarily interested in. Thus, the rest of the range is 10724472b590SDavid van Moolenbroek * going to have to be directly around this base block. We first check which 10734472b590SDavid van Moolenbroek * blocks from the set fall just before and after the base block, which then 10744472b590SDavid van Moolenbroek * allows us to construct a contiguous range of desired blocks directly 10754472b590SDavid van Moolenbroek * around the base block, in O(n) time. As a natural part of this, we ignore 10764472b590SDavid van Moolenbroek * duplicate blocks in the given set. We then read from the beginning of this 10774472b590SDavid van Moolenbroek * range, in order to maximize the chance that a next prefetch request will 10784472b590SDavid van Moolenbroek * continue from the last disk position without requiring a seek. However, we 10794472b590SDavid van Moolenbroek * do correct for the maximum number of blocks we can (or should) read in at 10804472b590SDavid van Moolenbroek * once, such that we will still end up reading the base block. 10814472b590SDavid van Moolenbroek */ 10824472b590SDavid van Moolenbroek base_block = blockset[0]; 10834472b590SDavid van Moolenbroek 10844472b590SDavid van Moolenbroek memset(blocks_before, 0, sizeof(blocks_before)); 10854472b590SDavid van Moolenbroek memset(blocks_after, 0, sizeof(blocks_after)); 10864472b590SDavid van Moolenbroek 10874472b590SDavid van Moolenbroek for (i = 1; i < nblocks; i++) { 10884472b590SDavid van Moolenbroek block = blockset[i]; 10894472b590SDavid van Moolenbroek 10904472b590SDavid van Moolenbroek if (block < base_block && block + LMFS_MAX_PREFETCH >= base_block) { 10914472b590SDavid van Moolenbroek bit = base_block - block - 1; 10924472b590SDavid van Moolenbroek assert(bit < LMFS_MAX_PREFETCH); 10934472b590SDavid van Moolenbroek SET_BIT(blocks_before, bit); 10944472b590SDavid van Moolenbroek } else if (block > base_block && 10954472b590SDavid van Moolenbroek block - LMFS_MAX_PREFETCH <= base_block) { 10964472b590SDavid van Moolenbroek bit = block - base_block - 1; 10974472b590SDavid van Moolenbroek assert(bit < LMFS_MAX_PREFETCH); 10984472b590SDavid van Moolenbroek SET_BIT(blocks_after, bit); 10994472b590SDavid van Moolenbroek } 11004472b590SDavid van Moolenbroek } 11014472b590SDavid van Moolenbroek 11024472b590SDavid van Moolenbroek for (nr_before = 0; nr_before < LMFS_MAX_PREFETCH; nr_before++) 11034472b590SDavid van Moolenbroek if (!GET_BIT(blocks_before, nr_before)) 11044472b590SDavid van Moolenbroek break; 11054472b590SDavid van Moolenbroek 11064472b590SDavid van Moolenbroek for (nr_after = 0; nr_after < LMFS_MAX_PREFETCH; nr_after++) 11074472b590SDavid van Moolenbroek if (!GET_BIT(blocks_after, nr_after)) 11084472b590SDavid van Moolenbroek break; 11094472b590SDavid van Moolenbroek 11104472b590SDavid van Moolenbroek /* The number of blocks to prefetch is the minimum of two factors: the number 11114472b590SDavid van Moolenbroek * of blocks in the range around the base block, and the maximum number of 11124472b590SDavid van Moolenbroek * blocks that should be read ahead at once at all. 11134472b590SDavid van Moolenbroek */ 11144472b590SDavid van Moolenbroek span = nr_before + 1 + nr_after; 11154472b590SDavid van Moolenbroek limit = lmfs_readahead_limit(); 11164472b590SDavid van Moolenbroek 11174472b590SDavid van Moolenbroek nr_blocks = MIN(span, limit); 11184472b590SDavid van Moolenbroek assert(nr_blocks >= 1 && nr_blocks <= LMFS_MAX_PREFETCH); 11194472b590SDavid van Moolenbroek 11204472b590SDavid van Moolenbroek /* Start prefetching from the lowest block within the contiguous range, but 11214472b590SDavid van Moolenbroek * make sure that we read at least the original base block itself, too. 11224472b590SDavid van Moolenbroek */ 11234472b590SDavid van Moolenbroek base_block -= MIN(nr_before, nr_blocks - 1); 11244472b590SDavid van Moolenbroek 11254472b590SDavid van Moolenbroek lmfs_readahead(dev, base_block, nr_blocks, fs_block_size); 11264472b590SDavid van Moolenbroek } 11274472b590SDavid van Moolenbroek 11284472b590SDavid van Moolenbroek /*===========================================================================* 11294472b590SDavid van Moolenbroek * lmfs_flushdev * 11304472b590SDavid van Moolenbroek *===========================================================================*/ 11314472b590SDavid van Moolenbroek void lmfs_flushdev(dev_t dev) 11324472b590SDavid van Moolenbroek { 11334472b590SDavid van Moolenbroek /* Flush all dirty blocks for one device. */ 11344472b590SDavid van Moolenbroek 11354472b590SDavid van Moolenbroek register struct buf *bp; 1136*129adfebSDavid van Moolenbroek static noxfer_buf_ptr_t *dirty; 11374472b590SDavid van Moolenbroek static unsigned int dirtylistsize = 0; 11384472b590SDavid van Moolenbroek unsigned int ndirty; 11394472b590SDavid van Moolenbroek 11404472b590SDavid van Moolenbroek if(dirtylistsize != nr_bufs) { 11414472b590SDavid van Moolenbroek if(dirtylistsize > 0) { 11424472b590SDavid van Moolenbroek assert(dirty != NULL); 11434472b590SDavid van Moolenbroek free(dirty); 11444472b590SDavid van Moolenbroek } 11454472b590SDavid van Moolenbroek if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs))) 11464472b590SDavid van Moolenbroek panic("couldn't allocate dirty buf list"); 11474472b590SDavid van Moolenbroek dirtylistsize = nr_bufs; 11484472b590SDavid van Moolenbroek } 11494472b590SDavid van Moolenbroek 11504472b590SDavid van Moolenbroek for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) { 11514472b590SDavid van Moolenbroek /* Do not flush dirty blocks that are in use (lmfs_count>0): the file 11524472b590SDavid van Moolenbroek * system may mark the block as dirty before changing its contents, in 11534472b590SDavid van Moolenbroek * which case the new contents could end up being lost. 11544472b590SDavid van Moolenbroek */ 11554472b590SDavid van Moolenbroek if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) { 11564472b590SDavid van Moolenbroek dirty[ndirty++] = bp; 11574472b590SDavid van Moolenbroek } 11584472b590SDavid van Moolenbroek } 11594472b590SDavid van Moolenbroek 11604472b590SDavid van Moolenbroek rw_scattered(dev, dirty, ndirty, WRITING); 11614472b590SDavid van Moolenbroek } 11624472b590SDavid van Moolenbroek 11634472b590SDavid van Moolenbroek /*===========================================================================* 1164433d6423SLionel Sambuc * rm_lru * 1165433d6423SLionel Sambuc *===========================================================================*/ 1166433d6423SLionel Sambuc static void rm_lru(struct buf *bp) 1167433d6423SLionel Sambuc { 1168433d6423SLionel Sambuc /* Remove a block from its LRU chain. */ 1169433d6423SLionel Sambuc struct buf *next_ptr, *prev_ptr; 1170433d6423SLionel Sambuc 1171433d6423SLionel Sambuc next_ptr = bp->lmfs_next; /* successor on LRU chain */ 1172433d6423SLionel Sambuc prev_ptr = bp->lmfs_prev; /* predecessor on LRU chain */ 1173433d6423SLionel Sambuc if (prev_ptr != NULL) 1174433d6423SLionel Sambuc prev_ptr->lmfs_next = next_ptr; 1175433d6423SLionel Sambuc else 1176433d6423SLionel Sambuc front = next_ptr; /* this block was at front of chain */ 1177433d6423SLionel Sambuc 1178433d6423SLionel Sambuc if (next_ptr != NULL) 1179433d6423SLionel Sambuc next_ptr->lmfs_prev = prev_ptr; 1180433d6423SLionel Sambuc else 1181433d6423SLionel Sambuc rear = prev_ptr; /* this block was at rear of chain */ 1182433d6423SLionel Sambuc } 1183433d6423SLionel Sambuc 1184433d6423SLionel Sambuc /*===========================================================================* 1185433d6423SLionel Sambuc * cache_resize * 1186433d6423SLionel Sambuc *===========================================================================*/ 11871311233cSDavid van Moolenbroek static void cache_resize(size_t blocksize, unsigned int bufs) 1188433d6423SLionel Sambuc { 1189433d6423SLionel Sambuc struct buf *bp; 1190433d6423SLionel Sambuc 1191433d6423SLionel Sambuc assert(blocksize > 0); 1192433d6423SLionel Sambuc assert(bufs >= MINBUFS); 1193433d6423SLionel Sambuc 1194433d6423SLionel Sambuc for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) 1195433d6423SLionel Sambuc if(bp->lmfs_count != 0) panic("change blocksize with buffer in use"); 1196433d6423SLionel Sambuc 1197433d6423SLionel Sambuc lmfs_buf_pool(bufs); 1198433d6423SLionel Sambuc 1199433d6423SLionel Sambuc fs_block_size = blocksize; 1200433d6423SLionel Sambuc } 1201433d6423SLionel Sambuc 12020314acfbSDavid van Moolenbroek static void cache_heuristic_check(void) 1203433d6423SLionel Sambuc { 1204433d6423SLionel Sambuc int bufs, d; 1205433d6423SLionel Sambuc 12061311233cSDavid van Moolenbroek bufs = fs_bufs_heuristic(MINBUFS, fs_btotal, fs_bused, fs_block_size); 1207433d6423SLionel Sambuc 1208433d6423SLionel Sambuc /* set the cache to the new heuristic size if the new one 1209433d6423SLionel Sambuc * is more than 10% off from the current one. 1210433d6423SLionel Sambuc */ 1211433d6423SLionel Sambuc d = bufs-nr_bufs; 1212433d6423SLionel Sambuc if(d < 0) d = -d; 1213433d6423SLionel Sambuc if(d*100/nr_bufs > 10) { 1214433d6423SLionel Sambuc cache_resize(fs_block_size, bufs); 1215433d6423SLionel Sambuc } 1216433d6423SLionel Sambuc } 1217433d6423SLionel Sambuc 1218433d6423SLionel Sambuc /*===========================================================================* 1219433d6423SLionel Sambuc * lmfs_set_blocksize * 1220433d6423SLionel Sambuc *===========================================================================*/ 12211311233cSDavid van Moolenbroek void lmfs_set_blocksize(size_t new_block_size) 1222433d6423SLionel Sambuc { 1223433d6423SLionel Sambuc cache_resize(new_block_size, MINBUFS); 12240314acfbSDavid van Moolenbroek cache_heuristic_check(); 1225433d6423SLionel Sambuc 1226433d6423SLionel Sambuc /* Decide whether to use seconday cache or not. 12270314acfbSDavid van Moolenbroek * Only do this if the block size is a multiple of the page size, and using 12280314acfbSDavid van Moolenbroek * the VM cache has been enabled for this FS. 1229433d6423SLionel Sambuc */ 1230433d6423SLionel Sambuc 1231433d6423SLionel Sambuc vmcache = 0; 1232433d6423SLionel Sambuc 1233433d6423SLionel Sambuc if(may_use_vmcache && !(new_block_size % PAGE_SIZE)) 1234433d6423SLionel Sambuc vmcache = 1; 1235433d6423SLionel Sambuc } 1236433d6423SLionel Sambuc 1237433d6423SLionel Sambuc /*===========================================================================* 1238433d6423SLionel Sambuc * lmfs_buf_pool * 1239433d6423SLionel Sambuc *===========================================================================*/ 1240433d6423SLionel Sambuc void lmfs_buf_pool(int new_nr_bufs) 1241433d6423SLionel Sambuc { 1242433d6423SLionel Sambuc /* Initialize the buffer pool. */ 1243433d6423SLionel Sambuc register struct buf *bp; 1244433d6423SLionel Sambuc 1245433d6423SLionel Sambuc assert(new_nr_bufs >= MINBUFS); 1246433d6423SLionel Sambuc 1247433d6423SLionel Sambuc if(nr_bufs > 0) { 1248433d6423SLionel Sambuc assert(buf); 1249c5beebb6SDavid van Moolenbroek lmfs_flushall(); 1250433d6423SLionel Sambuc for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) { 1251433d6423SLionel Sambuc if(bp->data) { 1252433d6423SLionel Sambuc assert(bp->lmfs_bytes > 0); 1253433d6423SLionel Sambuc munmap_t(bp->data, bp->lmfs_bytes); 1254433d6423SLionel Sambuc } 1255433d6423SLionel Sambuc } 1256433d6423SLionel Sambuc } 1257433d6423SLionel Sambuc 1258433d6423SLionel Sambuc if(buf) 1259433d6423SLionel Sambuc free(buf); 1260433d6423SLionel Sambuc 1261433d6423SLionel Sambuc if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs))) 1262433d6423SLionel Sambuc panic("couldn't allocate buf list (%d)", new_nr_bufs); 1263433d6423SLionel Sambuc 1264433d6423SLionel Sambuc if(buf_hash) 1265433d6423SLionel Sambuc free(buf_hash); 1266433d6423SLionel Sambuc if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs))) 1267433d6423SLionel Sambuc panic("couldn't allocate buf hash list (%d)", new_nr_bufs); 1268433d6423SLionel Sambuc 1269433d6423SLionel Sambuc nr_bufs = new_nr_bufs; 1270433d6423SLionel Sambuc 1271433d6423SLionel Sambuc bufs_in_use = 0; 1272433d6423SLionel Sambuc front = &buf[0]; 1273433d6423SLionel Sambuc rear = &buf[nr_bufs - 1]; 1274433d6423SLionel Sambuc 1275433d6423SLionel Sambuc for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) { 1276433d6423SLionel Sambuc bp->lmfs_blocknr = NO_BLOCK; 1277433d6423SLionel Sambuc bp->lmfs_dev = NO_DEV; 1278433d6423SLionel Sambuc bp->lmfs_next = bp + 1; 1279433d6423SLionel Sambuc bp->lmfs_prev = bp - 1; 1280433d6423SLionel Sambuc bp->data = NULL; 1281433d6423SLionel Sambuc bp->lmfs_bytes = 0; 1282433d6423SLionel Sambuc } 1283433d6423SLionel Sambuc front->lmfs_prev = NULL; 1284433d6423SLionel Sambuc rear->lmfs_next = NULL; 1285433d6423SLionel Sambuc 1286433d6423SLionel Sambuc for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->lmfs_hash = bp->lmfs_next; 1287433d6423SLionel Sambuc buf_hash[0] = front; 1288433d6423SLionel Sambuc } 1289433d6423SLionel Sambuc 1290433d6423SLionel Sambuc void lmfs_flushall(void) 1291433d6423SLionel Sambuc { 1292433d6423SLionel Sambuc struct buf *bp; 1293433d6423SLionel Sambuc for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++) 1294433d6423SLionel Sambuc if(bp->lmfs_dev != NO_DEV && !lmfs_isclean(bp)) 1295ebd3c067SDavid van Moolenbroek lmfs_flushdev(bp->lmfs_dev); 12961311233cSDavid van Moolenbroek 12971311233cSDavid van Moolenbroek /* This is the moment where it is least likely (although certainly not 12981311233cSDavid van Moolenbroek * impossible!) that there are buffers in use, since buffers should not 12991311233cSDavid van Moolenbroek * be held across file system syncs. See if we already intended to 13001311233cSDavid van Moolenbroek * resize the buffer cache, but couldn't. Be aware that we may be 13011311233cSDavid van Moolenbroek * called indirectly from within lmfs_change_blockusage(), so care must 13021311233cSDavid van Moolenbroek * be taken not to recurse infinitely. TODO: see if it is better to 13031311233cSDavid van Moolenbroek * resize the cache from here *only*, thus guaranteeing a clean cache. 13041311233cSDavid van Moolenbroek */ 13051311233cSDavid van Moolenbroek lmfs_change_blockusage(0); 1306433d6423SLionel Sambuc } 1307433d6423SLionel Sambuc 13081311233cSDavid van Moolenbroek size_t lmfs_fs_block_size(void) 1309433d6423SLionel Sambuc { 1310433d6423SLionel Sambuc return fs_block_size; 1311433d6423SLionel Sambuc } 1312433d6423SLionel Sambuc 1313433d6423SLionel Sambuc void lmfs_may_use_vmcache(int ok) 1314433d6423SLionel Sambuc { 1315433d6423SLionel Sambuc may_use_vmcache = ok; 1316433d6423SLionel Sambuc } 1317