/*	$NetBSD: kern_physio.c,v 1.87 2008/02/15 13:46:04 ad Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.87 2008/02/15 13:46:04 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;

/*
 * The routines implemented in this file are described in:
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *	    UNIX Operating System (Addison Wesley, 1989)
 * on pages 231-233.
 *
 * The routines "getphysbuf" and "putphysbuf" steal and return a swap
 * buffer.  Leffler, et al., says that swap buffers are used to do the
 * I/O, so raw I/O requests don't have to be single-threaded.  Of course,
 * NetBSD doesn't use "swap buffers" -- we have our own memory pool for
 * buffer descriptors.
 */

/* #define PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(PHYSIO_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(PHYSIO_DEBUG) */
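
/*
 * Per-request state shared by physio() and its in-flight buffers:
 * ps_running counts buffers handed to the driver but not yet completed,
 * ps_failed counts chunks that returned an error or hit end-of-medium,
 * and ps_error/ps_endoffset record the first such failure, i.e. the one
 * at the lowest offset.  ps_lock and ps_cv synchronize physio() with the
 * completion work done by the physio workqueue.
 */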
struct physio_stat {
	int ps_running;
	int ps_error;
	int ps_failed;
	off_t ps_endoffset;
	kmutex_t ps_lock;
	kcondvar_t ps_cv;
};

/* abuse these flags of struct buf */
#define	BC_DONTFREE	BC_AGE

/*
 * allocate a buffer structure for use in physical I/O.
 */
static struct buf *
getphysbuf(void)
{
	struct buf *bp;

	bp = getiobuf(NULL, true);
	bp->b_error = 0;
	bp->b_cflags = BC_BUSY;
	return(bp);
}

/*
 * get rid of a swap buffer structure which has been used in physical I/O.
 */
static void
putphysbuf(struct buf *bp)
{

	if ((bp->b_cflags & BC_DONTFREE) != 0) {
		return;
	}

	if (__predict_false(bp->b_cflags & BC_WANTED))
		panic("putphysbuf: private buf BC_WANTED");
	putiobuf(bp);
}

static void
physio_done(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;
	size_t todo = bp->b_bufsize;
	size_t done = bp->b_bcount - bp->b_resid;
	struct physio_stat *ps = bp->b_private;

	KASSERT(&bp->b_work == wk);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	KASSERT((bp->b_flags & B_PHYS) != 0);
	KASSERT(dummy == NULL);

	vunmapbuf(bp, todo);
	uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);

	mutex_enter(&ps->ps_lock);
	if (__predict_false(done != todo)) {
		off_t endoffset = dbtob(bp->b_blkno) + done;

		/*
		 * we got an error or hit EOM.
		 *
		 * we only care about the first one.
		 * ie. the one at the lowest offset.
		 */

		KASSERT(ps->ps_endoffset != endoffset);
		DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
		    ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
		    __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
		    bp->b_blkno, bp->b_bcount, bp->b_flags));

		if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
			DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
			    " -> %" PRIu64 "\n",
			    __func__, ps,
			    ps->ps_error, bp->b_error,
			    ps->ps_endoffset, endoffset));

			ps->ps_endoffset = endoffset;
			ps->ps_error = bp->b_error;
		}
		ps->ps_failed++;
	} else {
		KASSERT(bp->b_error == 0);
	}

	ps->ps_running--;
	cv_signal(&ps->ps_cv);
	mutex_exit(&ps->ps_lock);

	putphysbuf(bp);
}
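
/*
 * Buffer completion callback (b_iodone), invoked via biodone() when the
 * driver finishes a chunk.  It checks a few invariants under DIAGNOSTIC
 * and hands the buffer to the "physiod" workqueue, so that the real
 * completion work in physio_done() (unmapping and unlocking the user
 * pages, bookkeeping) runs in thread context rather than in the
 * biodone path.
 */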
static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
	struct physio_stat *ps = bp->b_private;
	size_t todo = bp->b_bufsize;

	KASSERT(ps->ps_running > 0);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
#endif /* defined(DIAGNOSTIC) */

	workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}

static void
physio_wait(struct physio_stat *ps, int n)
{

	KASSERT(mutex_owned(&ps->ps_lock));

	while (ps->ps_running > n)
		cv_wait(&ps->ps_cv, &ps->ps_lock);
}

static int
physio_init(void)
{
	int error;

	KASSERT(physio_workqueue == NULL);

	error = workqueue_create(&physio_workqueue, "physiod",
	    physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);

	return error;
}

#define	PHYSIO_CONCURRENCY	16	/* XXX tune */

/*
 * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
 * from the raw device to user buffers, and bypasses the buffer cache.
 *
 * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
 */
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
    void (*min_phys)(struct buf *), struct uio *uio)
{
	struct iovec *iovp;
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	int i, error;
	struct buf *bp = NULL;
	struct physio_stat *ps;
	int concurrency = PHYSIO_CONCURRENCY - 1;

	error = RUN_ONCE(&physio_initialized, physio_init);
	if (__predict_false(error != 0)) {
		return error;
	}

	DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	flags &= B_READ | B_WRITE;

	if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
		return ENOMEM;
	/* ps->ps_running = 0; */
	/* ps->ps_error = 0; */
	/* ps->ps_failed = 0; */
	ps->ps_endoffset = -1;
	mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ps->ps_cv, "physio");

	/* Make sure we have a buffer, creating one if necessary. */
	if (obp != NULL) {
		/* [raise the processor priority level to splbio;] */
		mutex_enter(&bufcache_lock);
		while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
			;
		/* Mark it busy, so nobody else will use it. */
		obp->b_cflags |= BC_DONTFREE;
		mutex_exit(&bufcache_lock);
		concurrency = 0; /* see "XXXkludge" comment below */
	}

	uvm_lwp_hold(l);
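
	/*
	 * Split the request into chunks of at most MAXPHYS bytes (further
	 * bounded by min_phys) and keep up to PHYSIO_CONCURRENCY of them
	 * in flight at once (just one when a caller-supplied buffer is
	 * used).  If a chunk does not end on a page boundary, wait for
	 * all outstanding I/O before issuing the next chunk, since
	 * uvm_vslock() rounds to page boundaries and neighbouring chunks
	 * would otherwise lock overlapping pages.
	 */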
	for (i = 0; i < uio->uio_iovcnt; i++) {
		bool sync = true;

		iovp = &uio->uio_iov[i];
		while (iovp->iov_len > 0) {
			size_t todo;
			vaddr_t endp;

			mutex_enter(&ps->ps_lock);
			if (ps->ps_failed != 0) {
				goto done_locked;
			}
			physio_wait(ps, sync ? 0 : concurrency);
			mutex_exit(&ps->ps_lock);
			if (obp != NULL) {
				/*
				 * XXXkludge
				 * some drivers use "obp" as an identifier.
				 */
				bp = obp;
			} else {
				bp = getphysbuf();
			}
			bp->b_dev = dev;
			bp->b_proc = p;
			bp->b_private = ps;

			/*
			 * [mark the buffer busy for physical I/O]
			 * (i.e. set B_PHYS (because it's an I/O to user
			 * memory, and B_RAW, because B_RAW is to be
			 * "Set by physio for raw transfers.", in addition
			 * to the "busy" and read/write flag.)
			 */
			bp->b_oflags = 0;
			bp->b_cflags = (bp->b_cflags & BC_DONTFREE) | BC_BUSY;
			bp->b_flags = flags | B_PHYS | B_RAW;
			bp->b_iodone = physio_biodone;

			/* [set up the buffer for a maximum-sized transfer] */
			bp->b_blkno = btodb(uio->uio_offset);
			if (dbtob(bp->b_blkno) != uio->uio_offset) {
				error = EINVAL;
				goto done;
			}
			bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
			bp->b_data = iovp->iov_base;

			/*
			 * [call minphys to bound the transfer size]
			 * and remember the amount of data to transfer,
			 * for later comparison.
			 */
			(*min_phys)(bp);
			todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
			if (todo > MAXPHYS)
				panic("todo(%zu) > MAXPHYS; minphys broken",
				    todo);
#endif /* defined(DIAGNOSTIC) */

			sync = false;
			endp = (vaddr_t)bp->b_data + todo;
			if (trunc_page(endp) != endp) {
				/*
				 * following requests can overlap.
				 * note that uvm_vslock does round_page.
				 */
				sync = true;
			}

			/*
			 * [lock the part of the user address space involved
			 * in the transfer]
			 * Beware vmapbuf(); it clobbers b_data and
			 * saves it in b_saveaddr.  However, vunmapbuf()
			 * restores it.
			 */
			error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
			    (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
			if (error) {
				goto done;
			}
			vmapbuf(bp, todo);

			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

			mutex_enter(&ps->ps_lock);
			ps->ps_running++;
			mutex_exit(&ps->ps_lock);

			/* [call strategy to start the transfer] */
			(*strategy)(bp);
			bp = NULL;

			iovp->iov_len -= todo;
			iovp->iov_base = (char *)iovp->iov_base + todo;
			uio->uio_offset += todo;
			uio->uio_resid -= todo;
		}
	}

done:
	mutex_enter(&ps->ps_lock);
done_locked:
	physio_wait(ps, 0);
	mutex_exit(&ps->ps_lock);
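
	/*
	 * If any chunk failed, roll uio_resid back so that it reflects
	 * only the data transferred below the lowest failing offset
	 * (ps_endoffset); everything at or beyond the first failure is
	 * counted as not done.
	 */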
	if (ps->ps_failed != 0) {
		off_t delta;

		delta = uio->uio_offset - ps->ps_endoffset;
		KASSERT(delta > 0);
		uio->uio_resid += delta;
		/* uio->uio_offset = ps->ps_endoffset; */
	} else {
		KASSERT(ps->ps_endoffset == -1);
	}
	if (bp != NULL) {
		putphysbuf(bp);
	}
	if (error == 0) {
		error = ps->ps_error;
	}
	mutex_destroy(&ps->ps_lock);
	cv_destroy(&ps->ps_cv);
	kmem_free(ps, sizeof(*ps));

	/*
	 * [clean up the state of the buffer]
	 * Remember if somebody wants it, so we can wake them up below.
	 * Also, if we had to steal it, give it back.
	 */
	if (obp != NULL) {
		KASSERT((obp->b_cflags & BC_BUSY) != 0);
		KASSERT((obp->b_cflags & BC_DONTFREE) != 0);

		/*
		 * [if another process is waiting for the raw I/O buffer,
		 * wake up processes waiting to do physical I/O;
		 */
		mutex_enter(&bufcache_lock);
		obp->b_cflags &= ~(BC_DONTFREE | BC_BUSY | BC_WANTED);
		obp->b_flags &= ~(B_PHYS | B_RAW);
		obp->b_iodone = NULL;
		cv_broadcast(&obp->b_busy);
		mutex_exit(&bufcache_lock);
	}
	uvm_lwp_rele(l);

	DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	return error;
}

/*
 * Leffler, et al., says on p. 231:
 *    "The minphys() routine is called by physio() to adjust the
 *     size of each I/O transfer before the latter is passed to
 *     the strategy routine..."
 *
 * so, just adjust the buffer's count accounting to MAXPHYS here,
 * and return the new count;
 */
void
minphys(struct buf *bp)
{

	if (bp->b_bcount > MAXPHYS)
		bp->b_bcount = MAXPHYS;
}