/*	$NetBSD: kern_physio.c,v 1.85 2007/11/06 00:42:42 ad Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.85 2007/11/06 00:42:42 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;

/*
 * The routines implemented in this file are described in:
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *	    UNIX Operating System (Addison-Wesley, 1989)
 * on pages 231-233.
 *
 * The routines "getphysbuf" and "putphysbuf" steal and return a swap
 * buffer.  Leffler, et al., say that swap buffers are used to do the
 * I/O, so raw I/O requests don't have to be single-threaded.  Of course,
 * NetBSD doesn't use "swap buffers" -- we have our own memory pool for
 * buffer descriptors.
 */

/* #define PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(PHYSIO_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(PHYSIO_DEBUG) */

struct physio_stat {
	int ps_running;
	int ps_error;
	int ps_failed;
	off_t ps_endoffset;
	kmutex_t ps_lock;
	kcondvar_t ps_cv;
};

/* abuse these flags of struct buf */
#define	B_DONTFREE	B_AGE

/*
 * allocate a buffer structure for use in physical I/O.
 */
static struct buf *
getphysbuf(void)
{
	struct buf *bp;

	bp = getiobuf();
	bp->b_error = 0;
	bp->b_flags = B_BUSY;
	return(bp);
}

/*
 * get rid of a swap buffer structure which has been used in physical I/O.
 */
static void
putphysbuf(struct buf *bp)
{

	if ((bp->b_flags & B_DONTFREE) != 0) {
		return;
	}

	if (__predict_false(bp->b_flags & B_WANTED))
		panic("putphysbuf: private buf B_WANTED");
	putiobuf(bp);
}

static void
physio_done(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;
	size_t todo = bp->b_bufsize;
	size_t done = bp->b_bcount - bp->b_resid;
	struct physio_stat *ps = bp->b_private;

	KASSERT(&bp->b_work == wk);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	KASSERT((bp->b_flags & B_PHYS) != 0);
	KASSERT(dummy == NULL);

	vunmapbuf(bp, todo);
	uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);

	mutex_enter(&ps->ps_lock);
	if (__predict_false(done != todo)) {
		off_t endoffset = dbtob(bp->b_blkno) + done;

		/*
		 * The transfer was cut short: we got an error or hit
		 * end-of-media.  With several transfers in flight, only
		 * the first failure matters for the byte count reported
		 * back to the caller, i.e. the one at the lowest offset.
		 */

		KASSERT(ps->ps_endoffset != endoffset);
		DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
		    ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
		    __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
		    bp->b_blkno, bp->b_bcount, bp->b_flags));

		if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
			DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
			    " -> %" PRIu64 "\n",
			    __func__, ps,
			    ps->ps_error, bp->b_error,
			    ps->ps_endoffset, endoffset));

			ps->ps_endoffset = endoffset;
			ps->ps_error = bp->b_error;
		}
		ps->ps_failed++;
	} else {
		KASSERT(bp->b_error == 0);
	}

	ps->ps_running--;
	cv_signal(&ps->ps_cv);
	mutex_exit(&ps->ps_lock);

	putphysbuf(bp);
}

static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
	struct physio_stat *ps = bp->b_private;
	size_t todo = bp->b_bufsize;

	KASSERT(ps->ps_running > 0);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
#endif /* defined(DIAGNOSTIC) */

	workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}
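
/*
 * Illustrative sketch (hypothetical driver "foo"; not part of this file):
 * the strategy routine a driver passes to physio() performs the transfer
 * described by the buffer and calls biodone() when it finishes.  Because
 * physio() sets B_CALL and b_iodone, biodone() then hands the buffer to
 * physio_biodone() above, which queues it for physio_done().
 *
 *	void
 *	foostrategy(struct buf *bp)
 *	{
 *
 *		... start the transfer of bp->b_bcount bytes at device
 *		... block bp->b_blkno to/from bp->b_data, setting
 *		... bp->b_error and bp->b_resid on failure, and then:
 *		biodone(bp);
 *	}
 */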

static void
physio_wait(struct physio_stat *ps, int n)
{

	KASSERT(mutex_owned(&ps->ps_lock));

	while (ps->ps_running > n)
		cv_wait(&ps->ps_cv, &ps->ps_lock);
}

static int
physio_init(void)
{
	int error;

	KASSERT(physio_workqueue == NULL);

	error = workqueue_create(&physio_workqueue, "physiod",
	    physio_done, NULL, PRI_BIO, IPL_BIO, 0);

	return error;
}

#define	PHYSIO_CONCURRENCY	16	/* XXX tune */

/*
 * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
 * from the raw device to user buffers, and bypasses the buffer cache.
 *
 * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
 */
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
    void (*min_phys)(struct buf *), struct uio *uio)
{
	struct iovec *iovp;
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	int i, s;
	int error;
	struct buf *bp = NULL;
	struct physio_stat *ps;
	int concurrency = PHYSIO_CONCURRENCY - 1;

	error = RUN_ONCE(&physio_initialized, physio_init);
	if (__predict_false(error != 0)) {
		return error;
	}

	DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	flags &= B_READ | B_WRITE;

	if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
		return ENOMEM;
	/* ps->ps_running = 0; */
	/* ps->ps_error = 0; */
	/* ps->ps_failed = 0; */
	ps->ps_endoffset = -1;
	mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ps->ps_cv, "physio");

	/* Make sure we have a buffer, creating one if necessary. */
	if (obp != NULL) {
		/* [raise the processor priority level to splbio;] */
		s = splbio();
		simple_lock(&obp->b_interlock);

		/* [while the buffer is marked busy] */
		while (obp->b_flags & B_BUSY) {
			/* [mark the buffer wanted] */
			obp->b_flags |= B_WANTED;
			/* [wait until the buffer is available] */
			ltsleep(obp, PRIBIO+1, "physbuf", 0, &obp->b_interlock);
		}

		/* Mark it busy, so nobody else will use it. */
		obp->b_flags = B_BUSY | B_DONTFREE;

		/* [lower the priority level] */
		simple_unlock(&obp->b_interlock);
		splx(s);

		concurrency = 0; /* see "XXXkludge" comment below */
	}

	uvm_lwp_hold(l);

	for (i = 0; i < uio->uio_iovcnt; i++) {
		bool sync = true;

		iovp = &uio->uio_iov[i];
		while (iovp->iov_len > 0) {
			size_t todo;
			vaddr_t endp;

			mutex_enter(&ps->ps_lock);
			if (ps->ps_failed != 0) {
				goto done_locked;
			}
			physio_wait(ps, sync ? 0 : concurrency);
			mutex_exit(&ps->ps_lock);
			if (obp != NULL) {
				/*
				 * XXXkludge
				 * some drivers use "obp" as an identifier.
				 */
				bp = obp;
			} else {
				bp = getphysbuf();
			}
			bp->b_dev = dev;
			bp->b_proc = p;
			bp->b_private = ps;
			bp->b_vp = NULL;

			/*
			 * [mark the buffer busy for physical I/O]
			 * (i.e. set B_PHYS because it's an I/O to user
			 * memory, and B_RAW because B_RAW is to be
			 * "Set by physio for raw transfers", in addition
			 * to the "busy" and read/write flags.)
			 */
			bp->b_flags = (bp->b_flags & B_DONTFREE) |
			    B_BUSY | B_PHYS | B_RAW | B_CALL | flags;
			bp->b_iodone = physio_biodone;

			/* [set up the buffer for a maximum-sized transfer] */
			bp->b_blkno = btodb(uio->uio_offset);
			if (dbtob(bp->b_blkno) != uio->uio_offset) {
				error = EINVAL;
				goto done;
			}
			bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
			bp->b_data = iovp->iov_base;

			/*
			 * [call minphys to bound the transfer size]
			 * and remember the amount of data to transfer,
			 * for later comparison.
			 */
			(*min_phys)(bp);
			todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
			if (todo > MAXPHYS)
				panic("todo(%zu) > MAXPHYS; minphys broken",
				    todo);
#endif /* defined(DIAGNOSTIC) */

			sync = false;
			endp = (vaddr_t)bp->b_data + todo;
			if (trunc_page(endp) != endp) {
				/*
				 * This transfer ends mid-page, so the next
				 * request would lock an overlapping page
				 * range (uvm_vslock() rounds to page
				 * boundaries); wait for this one to finish
				 * before issuing it.
				 */
				sync = true;
			}

			/*
			 * [lock the part of the user address space involved
			 *    in the transfer]
			 * Beware vmapbuf(); it clobbers b_data and
			 * saves it in b_saveaddr.  However, vunmapbuf()
			 * restores it.
			 */
			error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
			    (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
			if (error) {
				goto done;
			}
			vmapbuf(bp, todo);

			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

			mutex_enter(&ps->ps_lock);
			ps->ps_running++;
			mutex_exit(&ps->ps_lock);

			/* [call strategy to start the transfer] */
			(*strategy)(bp);
			bp = NULL;

			iovp->iov_len -= todo;
			iovp->iov_base = (char *)iovp->iov_base + todo;
			uio->uio_offset += todo;
			uio->uio_resid -= todo;
		}
	}

done:
	mutex_enter(&ps->ps_lock);
done_locked:
	physio_wait(ps, 0);
	mutex_exit(&ps->ps_lock);

	if (ps->ps_failed != 0) {
		off_t delta;

		delta = uio->uio_offset - ps->ps_endoffset;
		KASSERT(delta > 0);
		uio->uio_resid += delta;
		/* uio->uio_offset = ps->ps_endoffset; */
	} else {
		KASSERT(ps->ps_endoffset == -1);
	}
	if (bp != NULL) {
		putphysbuf(bp);
	}
	if (error == 0) {
		error = ps->ps_error;
	}
	mutex_destroy(&ps->ps_lock);
	cv_destroy(&ps->ps_cv);
	kmem_free(ps, sizeof(*ps));

	/*
	 * [clean up the state of the buffer]
	 * Remember if somebody wants it, so we can wake them up below.
	 * Also, if we had to steal it, give it back.
	 */
	if (obp != NULL) {
		KASSERT((obp->b_flags & B_BUSY) != 0);
		KASSERT((obp->b_flags & B_DONTFREE) != 0);

		/*
		 * [if another process is waiting for the raw I/O buffer,
		 *    wake up processes waiting to do physical I/O;]
		 */
		s = splbio();
		simple_lock(&obp->b_interlock);
		obp->b_flags &=
		    ~(B_BUSY | B_PHYS | B_RAW | B_CALL | B_DONTFREE);
		if ((obp->b_flags & B_WANTED) != 0) {
			obp->b_flags &= ~B_WANTED;
			wakeup(obp);
		}
		simple_unlock(&obp->b_interlock);
		splx(s);
	}
	uvm_lwp_rele(l);

	DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	return error;
}
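
/*
 * Illustrative sketch (hypothetical character device "foo"; not part of
 * this file): a driver's read and write entry points typically do little
 * more than hand their struct uio to physio(), passing NULL for obp so
 * that physio() allocates buffers itself and can keep several transfers
 * in flight:
 *
 *	int
 *	fooread(dev_t dev, struct uio *uio, int ioflag)
 *	{
 *
 *		return physio(foostrategy, NULL, dev, B_READ, minphys, uio);
 *	}
 *
 *	int
 *	foowrite(dev_t dev, struct uio *uio, int ioflag)
 *	{
 *
 *		return physio(foostrategy, NULL, dev, B_WRITE, minphys, uio);
 *	}
 */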
443 */ 444 if (obp != NULL) { 445 KASSERT((obp->b_flags & B_BUSY) != 0); 446 KASSERT((obp->b_flags & B_DONTFREE) != 0); 447 448 /* 449 * [if another process is waiting for the raw I/O buffer, 450 * wake up processes waiting to do physical I/O; 451 */ 452 s = splbio(); 453 simple_lock(&obp->b_interlock); 454 obp->b_flags &= 455 ~(B_BUSY | B_PHYS | B_RAW | B_CALL | B_DONTFREE); 456 if ((obp->b_flags & B_WANTED) != 0) { 457 obp->b_flags &= ~B_WANTED; 458 wakeup(obp); 459 } 460 simple_unlock(&obp->b_interlock); 461 splx(s); 462 } 463 uvm_lwp_rele(l); 464 465 DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", 466 __func__, uio->uio_offset, uio->uio_resid)); 467 468 return error; 469 } 470 471 /* 472 * Leffler, et al., says on p. 231: 473 * "The minphys() routine is called by physio() to adjust the 474 * size of each I/O transfer before the latter is passed to 475 * the strategy routine..." 476 * 477 * so, just adjust the buffer's count accounting to MAXPHYS here, 478 * and return the new count; 479 */ 480 void 481 minphys(struct buf *bp) 482 { 483 484 if (bp->b_bcount > MAXPHYS) 485 bp->b_bcount = MAXPHYS; 486 } 487