/*	$OpenBSD: vfs_sync.c,v 1.19 2001/06/22 14:14:11 deraadt Exp $	*/

/*
 * Portions of this code are:
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Syncer daemon: a kernel process that walks a wheel of per-second work
 * queues and flushes dirty vnodes whose delay has expired.
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/malloc.h>

#include <sys/kernel.h>

#ifdef FFS_SOFTUPDATES
int   softdep_process_worklist __P((struct mount *));
#endif

/*
 * The workitem queue tunables.
 */
#define SYNCER_MAXDELAY	32		/* maximum sync delay time */
#define SYNCER_DEFAULT	30		/* default sync delay time */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = SYNCER_DEFAULT;	/* time to delay syncing vnodes */

int rushjob = 0;			/* number of slots to run ASAP */
int stat_rush_requests = 0;		/* number of rush requests */

/* Index of the next wheel slot to be processed by sched_sync(). */
static int syncer_delayno = 0;
/* Power-of-two mask for wheel indexing; set by hashinit(). */
static long syncer_mask;
LIST_HEAD(synclist, vnode);
/* The wheel itself: an array of syncer_maxdelay vnode lists. */
static struct synclist *syncer_workitem_pending;

struct proc *syncerproc;		/* the syncer kernel process */

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, vnodes mounted on block
 * devices are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Allocate the workitem wheel.  hashinit() rounds syncer_maxdelay up to
 * a power of two and returns the corresponding mask, so syncer_maxdelay
 * is re-derived from the mask afterwards to match the actual array size.
 */
void
vn_initialize_syncerd()

{
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, M_WAITOK,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Add an item to the syncer work queue.
 *
 * The vnode is placed "delay" slots ahead of the slot currently being
 * processed (clamped below syncer_maxdelay - 1 so the slot computation
 * never wraps onto the current slot).  If the vnode is already on some
 * synclist it is moved rather than double-inserted; the VBIOONSYNCLIST
 * flag tracks membership.  The list manipulation is protected by splbio().
 */
void
vn_syncer_add_to_worklist(vp, delay)
	struct vnode *vp;
	int delay;
{
	int s, slot;

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	s = splbio();
	if (vp->v_bioflag & VBIOONSYNCLIST)
		LIST_REMOVE(vp, v_synclist);

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_bioflag |= VBIOONSYNCLIST;
	splx(s);
}

/*
 * System filesystem synchronizer daemon.
 *
 * Main loop of the syncer process: once a second (or faster when rushjob
 * is raised), drain the current wheel slot by fsync'ing each vnode on it,
 * then advance to the next slot.
 */

void
sched_sync(p)
	struct proc *p;
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;

	syncerproc = curproc;

	for (;;) {
		starttime = time.tv_sec;

		/*
		 * Push files whose dirty time has expired.
		 */
		slp = &syncer_workitem_pending[syncer_delayno];
		/* Advance the wheel before draining so that requeues via
		 * vn_syncer_add_to_worklist() land in future slots. */
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		s = splbio();
		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, p) != 0) {
				/*
				 * If we fail to get the lock, we move this
				 * vnode one second ahead in time.
				 * XXX - no good, but the best we can do.
				 * (We stay at splbio here; the nested
				 * splbio/splx inside the add is harmless.)
				 */
				vn_syncer_add_to_worklist(vp, 1);
				continue;
			}
			/* Drop splbio across the (possibly sleeping) fsync. */
			splx(s);
			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
			VOP_UNLOCK(vp, 0, p);
			s = splbio();
			/* If the vnode is still at the head of this slot, the
			 * fsync did not requeue it elsewhere. */
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: disk vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
				    vp->v_type != VBLK)
					panic("sched_sync: fsync failed");
				/*
				 * Put us back on the worklist. The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
		}

		splx(s);

#ifdef FFS_SOFTUPDATES
		/*
		 * Do soft update processing.
		 */
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time.tv_sec == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 *
 * Returns 1 if the rush request was accepted, 0 if rushjob was
 * already at its cap.  If the syncer is asleep on lbolt it is
 * woken immediately (done at splhigh to interlock with the sleep).
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (syncerproc && syncerproc->p_wchan == &lbolt)
		setrunnable(syncerproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return 1;
	}
	return 0;
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 *
 * The syncer vnode carries only the small set of vnode operations below;
 * everything else falls through to vn_default_error via vop_default_desc.
 */
#define sync_close nullop
int   sync_fsync __P((void *));
int   sync_inactive __P((void *));
#define sync_reclaim nullop
#define sync_lock vop_generic_lock
#define sync_unlock vop_generic_unlock
int   sync_print __P((void *));
#define sync_islocked vop_generic_islocked

int (**sync_vnodeop_p) __P((void *));
struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_close_desc, sync_close },		/* close */
	{ &vop_fsync_desc, sync_fsync },		/* fsync */
	{ &vop_inactive_desc, sync_inactive },		/* inactive */
	{ &vop_reclaim_desc, sync_reclaim },		/* reclaim */
	{ &vop_lock_desc, sync_lock },			/* lock */
	{ &vop_unlock_desc, sync_unlock },		/* unlock */
	{ &vop_print_desc, sync_print },		/* print */
	{ &vop_islocked_desc, sync_islocked },		/* islocked */
	{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
};
struct vnodeopv_desc sync_vnodeop_opv_desc = {
	&sync_vnodeop_p, sync_vnodeop_entries
};

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 *
 * On success the vnode is stored in mp->mnt_syncer and placed on the
 * syncer worklist; on failure mnt_syncer is cleared and the getnewvnode()
 * error is returned.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_writecount = 1;
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 * (The static start/incr/next triple is a simple halving
	 * sequence regenerated whenever next overruns the wheel.)
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, next);
	mp->mnt_syncer = vp;
	return (0);
}

/*
 * Do a lazy sync of the filesystem.
 *
 * Called via VOP_FSYNC on the syncer vnode; a no-op unless the waitfor
 * argument is MNT_LAZY.  Requeues itself syncdelay seconds ahead, then
 * tries to sync the mount's dirty vnodes with MNT_ASYNC temporarily
 * cleared so the writes are actually issued.
 */
int
sync_fsync(v)
	void *v;
{
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap = v;
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 * vfs_busy(LK_NOWAIT) both skips mounts being unmounted and,
	 * on success, releases mountlist_slock for us; on failure we
	 * must drop it ourselves.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, ap->a_p) == 0) {
		asyncflag = mp->mnt_flag & MNT_ASYNC;
		mp->mnt_flag &= ~MNT_ASYNC;
		VFS_SYNC(mp, MNT_LAZY, ap->a_cred, ap->a_p);
		if (asyncflag)
			mp->mnt_flag |= MNT_ASYNC;
		vfs_unbusy(mp, ap->a_p);
	} else
		simple_unlock(&mountlist_slock);

	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * With a zero use count there is nothing to tear down; otherwise detach
 * the vnode from its mount and worklist and release the reference
 * (vput() also drops the lock taken by the caller of VOP_INACTIVE).
 */
int
sync_inactive(v)
	void *v;
{
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap = v;

	struct vnode *vp = ap->a_vp;

	if (vp->v_usecount == 0) {
		VOP_UNLOCK(vp, 0, ap->a_p);
		return (0);
	}
	vp->v_mount->mnt_syncer = NULL;
	LIST_REMOVE(vp, v_synclist);
	vp->v_writecount = 0;
	vput(vp);
	return (0);
}

/*
 * Print out a syncer vnode (for ddb / vprint diagnostics).
 */
int
sync_print(v)
	void *v;

{
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}