/*
 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implements new VFS/VM coherency functions. For conforming VFSs
 * we treat the backing VM object slightly differently. Instead of
 * maintaining a number of pages to exactly fit the size of the file
 * we instead maintain pages to fit the entire contents of the last
 * buffer cache buffer used by the file.
 *
 * For VFSs like NFS and HAMMER which use (generally speaking) fixed
 * sized buffers this greatly reduces the complexity of VFS/VM interactions.
 *
 * Truncations no longer invalidate pages covered by the buffer cache
 * beyond the file EOF which still fit within the file's last buffer.
 * We simply unmap them and do not allow userland to fault them in.
 *
 * The VFS is no longer responsible for zero-filling buffers during a
 * truncation, the last buffer will be automatically zero-filled by
 * nvtruncbuf().
 *
 * This code is intended to (eventually) replace vtruncbuf() and
 * vnode_pager_setsize().
 */
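
/*
 * Illustrative example of the coverage rule above (a sketch, assuming
 * 4K VM pages and a single 16K buffer): a 10000-byte file keeps pages
 * resident out to offset 16384, the end of its last buffer, rather than
 * to 12288, the page-rounded EOF, so buffer cache and VM object coverage
 * always agree.
 */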

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <sys/mplock2.h>

static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);

/*
 * Truncate a file's buffer and pages to a specified length. The
 * byte-granular length of the file is specified along with the block
 * size of the buffer containing that offset.
 *
 * If the last buffer straddles the length its contents will be zero-filled
 * as appropriate. All buffers and pages after the last buffer will be
 * destroyed. The last buffer itself will be destroyed only if the length
 * is exactly aligned with it.
 *
 * UFS typically passes the old block size prior to the actual truncation,
 * then later resizes the block based on the new file size. NFS uses a
 * fixed block size and doesn't care. HAMMER uses a block size based on
 * the offset which is fixed for any particular offset.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension. The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped. The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again. Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
{
        off_t truncloffset;
        off_t truncboffset;
        const char *filename;
        struct buf *bp;
        int count;
        int error;

        /*
         * Round up to the *next* block, then destroy the buffers in question.
         * Since we are only removing some of the buffers we must rely on the
         * scan count to determine whether a loop is necessary.
         *
         * Destroy any pages beyond the last buffer.
         */
        if (boff < 0)
                boff = (int)(length % blksize);
        if (boff)
                truncloffset = length + (blksize - boff);
        else
                truncloffset = length;

        lwkt_gettoken(&vp->v_token);
        do {
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &truncloffset);
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &truncloffset);
        } while(count);

        nvnode_pager_setsize(vp, length, blksize, boff);

        /*
         * Zero-fill the area beyond the file EOF that still fits within
         * the last buffer. We must mark the buffer as dirty even though
         * the modified area is beyond EOF to avoid races where the kernel
         * might flush the buffer before the filesystem is able to reallocate
         * the block.
         *
         * The VFS is responsible for dealing with the actual truncation.
         */
        if (boff) {
                truncboffset = length - boff;
                error = bread(vp, truncboffset, blksize, &bp);
                if (error == 0) {
                        bzero(bp->b_data + boff, blksize - boff);
                        if (bp->b_flags & B_DELWRI) {
                                if (bp->b_dirtyoff > boff)
                                        bp->b_dirtyoff = boff;
                                if (bp->b_dirtyend > boff)
                                        bp->b_dirtyend = boff;
                        }
                        bp->b_bio2.bio_offset = NOOFFSET;
                        bdwrite(bp);
                }
        } else {
                error = 0;
        }

        /*
         * For safety, fsync any remaining metadata if the file is not being
         * truncated to 0. Since the metadata does not represent the entire
         * dirty list we have to rely on the hit count to ensure that we get
         * all of it.
         *
         * This is typically applicable only to UFS. NFS and HAMMER do
         * not store indirect blocks in the per-vnode buffer cache.
         */
        if (length > 0) {
                do {
                        count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                        nvtruncbuf_bp_metasync_cmp,
                                        nvtruncbuf_bp_metasync, vp);
                } while (count);
        }

        /*
         * It is possible to have in-progress I/O from buffers that were
         * not part of the truncation. This should not happen if we
         * are truncating to 0-length.
         */
        bio_track_wait(&vp->v_track_write, 0, 0);

        /*
         * Debugging only
         */
        spin_lock_wr(&vp->v_spinlock);
        filename = TAILQ_FIRST(&vp->v_namecache) ?
                   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
        spin_unlock_wr(&vp->v_spinlock);

        /*
         * Make sure no buffers were instantiated while we were trying
         * to clean out the remaining VM pages. This could occur due
         * to busy dirty VM pages being flushed out to disk.
         */
        do {
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &truncloffset);
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &truncloffset);
                if (count) {
                        kprintf("Warning: nvtruncbuf(): Had to re-clean %d "
                                "left over buffers in %s\n", count, filename);
                }
        } while(count);

        lwkt_reltoken(&vp->v_token);

        return (error);
}
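
/*
 * Example caller (a sketch only; myfs_blksize() is a hypothetical helper,
 * not a real API): a filesystem's truncation path would typically adjust
 * its own metadata and then call
 *
 *      blksize = myfs_blksize(vp, length);
 *      error = nvtruncbuf(vp, length, blksize, (int)(length % blksize));
 *
 * Filesystems whose buffer base offset can be derived from length and
 * blksize may instead pass boff as -1 and let nvtruncbuf() compute it.
 */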

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static
int
nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
        if (bp->b_loffset >= *(off_t *)data)
                return(0);
        return(-1);
}

static
int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
        /*
         * Do not try to use a buffer we cannot immediately lock, but sleep
         * anyway to prevent a livelock. The code will loop until all buffers
         * can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
                brelse(bp);
        }
        return(1);
}

/*
 * Fsync all meta-data after truncating a file to a non-zero length. Only
 * metadata blocks (with a negative loffset) are scanned.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
        if (bp->b_loffset < 0)
                return(0);
        return(1);
}

static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
        struct vnode *vp = data;

        if (bp->b_flags & B_DELWRI) {
                /*
                 * Do not try to use a buffer we cannot immediately lock,
                 * but sleep anyway to prevent a livelock. The code will
                 * loop until all buffers can be acted upon.
                 */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                        if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                                BUF_UNLOCK(bp);
                } else {
                        bremfree(bp);
                        if (bp->b_vp == vp)
                                bawrite(bp);
                        else
                                bwrite(bp);
                }
                return(1);
        } else {
                return(0);
        }
}

/*
 * Extend a file's buffer and pages to a new, larger size. The block size
 * at both the old and new length must be passed, but buffer cache operations
 * will only be performed on the old block. The new nlength/nblksize will
 * be used to properly set the VM object size.
 *
 * To make this explicit we require the old length to be passed even though
 * we can acquire it from vp->v_filesize, which also avoids potential
 * corruption if the filesystem and vp get desynchronized somehow.
 *
 * If the caller intends to immediately write into the newly extended
 * space pass trivial == 1. If trivial is 0 the original buffer will be
 * zero-filled as necessary to clean out any junk in the extended space.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension. The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped. The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again. Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
            int oblksize, int nblksize, int oboff, int nboff, int trivial)
{
        off_t truncboffset;
        struct buf *bp;
        int error;

        error = 0;
        nvnode_pager_setsize(vp, nlength, nblksize, nboff);
        if (trivial == 0) {
                if (oboff < 0)
                        oboff = (int)(olength % oblksize);
                truncboffset = olength - oboff;

                if (oboff) {
                        error = bread(vp, truncboffset, oblksize, &bp);
                        if (error == 0) {
                                bzero(bp->b_data + oboff, oblksize - oboff);
                                bp->b_bio2.bio_offset = NOOFFSET;
                                bdwrite(bp);
                        }
                }
        }
        return (error);
}
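
/*
 * Example caller (a sketch only): a write path extending a file from
 * osize to nsize might call
 *
 *      nvextendbuf(vp, osize, nsize, oblksize, nblksize,
 *                  (int)(osize % oblksize), (int)(nsize % nblksize),
 *                  trivial);
 *
 * passing trivial == 1 when it is about to dirty the newly extended
 * space itself and 0 when the old last buffer must be zero-filled here.
 */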

/*
 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
 * the last buffer when truncating.
 *
 * This function does not do any zeroing or invalidating of partially
 * overlapping pages. Zeroing is the responsibility of nvtruncbuf().
 * However, it does unmap VM pages from the user address space on a
 * page-granular (versus buffer cache granular) basis.
 *
 * If boff is passed as -1 the base offset of the buffer cache buffer is
 * calculated from length and blksize. Filesystems such as UFS which deal
 * with fragments have to specify a boff >= 0 since the base offset cannot
 * be calculated from length and blksize.
 *
 * For UFS blksize is the 'new' blocksize, used only to determine how large
 * the VM object must become.
 */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
        vm_pindex_t nobjsize;
        vm_pindex_t oobjsize;
        vm_pindex_t pi;
        vm_object_t object;
        vm_page_t m;
        off_t truncboffset;

        /*
         * Degenerate conditions
         */
        if ((object = vp->v_object) == NULL)
                return;
        if (length == vp->v_filesize)
                return;

        /*
         * Calculate the size of the VM object, coverage includes
         * the buffer straddling EOF. If EOF is buffer-aligned
         * we don't bother.
         *
         * Buffers do not have to be page-aligned. Make sure
         * nobjsize is beyond the last page of the buffer.
         */
        if (boff < 0)
                boff = (int)(length % blksize);
        truncboffset = length - boff;
        oobjsize = object->size;
        if (boff)
                nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
        else
                nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
        object->size = nobjsize;

        if (length < vp->v_filesize) {
                /*
                 * File has shrunk, toss any cached pages beyond
                 * the end of the buffer (blksize aligned) for the
                 * new EOF.
                 */
                vp->v_filesize = length;
                if (nobjsize < oobjsize) {
                        vm_object_page_remove(object, nobjsize, oobjsize,
                                              FALSE);
                }

                /*
                 * Unmap any pages (page aligned) beyond the new EOF.
                 * The pages remain part of the (last) buffer and are not
                 * invalidated.
                 */
                pi = OFF_TO_IDX(length + PAGE_MASK);
                lwkt_gettoken(&vm_token);
                while (pi < nobjsize) {
                        do {
                                m = vm_page_lookup(object, pi);
                        } while (m && vm_page_sleep_busy(m, TRUE, "vsetsz"));
                        if (m) {
                                vm_page_busy(m);
                                vm_page_protect(m, VM_PROT_NONE);
                                vm_page_wakeup(m);
                        }
                        ++pi;
                }
                lwkt_reltoken(&vm_token);
        } else {
                /*
                 * File has expanded.
                 */
                vp->v_filesize = length;
        }
}
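
/*
 * Usage note (a sketch only): nvtruncbuf() and nvextendbuf() above call
 * this function themselves, so most filesystems never need to invoke it
 * directly. A VFS that learns of a size change without touching the
 * buffer cache might call it as, e.g.,
 *
 *      nvnode_pager_setsize(vp, length, blksize, -1);
 *
 * with boff passed as -1 when the buffer base offset can be derived
 * from length and blksize.
 */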