/*	$NetBSD: kern_physio.c,v 1.99 2021/09/16 22:19:11 andvar Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.99 2021/09/16 22:19:11 andvar Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;

int physio_concurrency = 16;

/* #define	PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(PHYSIO_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(PHYSIO_DEBUG) */

struct physio_stat {
	int ps_running;
	int ps_error;
	int ps_failed;
	off_t ps_endoffset;
	size_t ps_resid;
	buf_t *ps_orig_bp;
	kmutex_t ps_lock;
	kcondvar_t ps_cv;
};

static void
physio_done(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;
	size_t todo = bp->b_bufsize;
	size_t done = bp->b_bcount - bp->b_resid;
	struct physio_stat *ps = bp->b_private;
	bool is_iobuf;

	KASSERT(&bp->b_work == wk);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	KASSERT((bp->b_flags & B_PHYS) != 0);
	KASSERT(dummy == NULL);

	vunmapbuf(bp, todo);
	uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);

	mutex_enter(&ps->ps_lock);
	is_iobuf = (bp != ps->ps_orig_bp);
	if (__predict_false(done != todo)) {
		off_t endoffset = dbtob(bp->b_blkno) + done;

		/*
		 * we got an error or hit EOM.
		 *
		 * we only care about the first one.
		 * ie. the one at the lowest offset.
		 */

		KASSERT(ps->ps_endoffset != endoffset);
		DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
		    ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
		    __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
		    bp->b_blkno, bp->b_bcount, bp->b_flags));

		if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
			DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
			    " -> %" PRIu64 "\n",
			    __func__, ps,
			    ps->ps_error, bp->b_error,
			    ps->ps_endoffset, endoffset));

			ps->ps_endoffset = endoffset;
			ps->ps_error = bp->b_error;
		}
		ps->ps_failed++;

		ps->ps_resid += todo - done;
	} else {
		KASSERT(bp->b_error == 0);
	}

	ps->ps_running--;
	cv_signal(&ps->ps_cv);
	mutex_exit(&ps->ps_lock);

	if (is_iobuf)
		putiobuf(bp);
}

static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
	struct physio_stat *ps = bp->b_private;
	size_t todo = bp->b_bufsize;
	size_t done = bp->b_bcount - bp->b_resid;

	KASSERT(ps->ps_running > 0);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	if (done == todo)
		KASSERT(bp->b_error == 0);
#endif /* defined(DIAGNOSTIC) */

	workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}

static void
physio_wait(struct physio_stat *ps, int n)
{

	KASSERT(mutex_owned(&ps->ps_lock));

	while (ps->ps_running > n)
		cv_wait(&ps->ps_cv, &ps->ps_lock);
}

static int
physio_init(void)
{
	int error;

	KASSERT(physio_workqueue == NULL);

	error = workqueue_create(&physio_workqueue, "physiod",
	    physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);

	return error;
}

/*
 * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
 * from the raw device to user buffers, and bypasses the buffer cache.
 */
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
    void (*min_phys)(struct buf *), struct uio *uio)
{
	struct iovec *iovp;
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	int i, error;
	struct buf *bp = NULL;
	struct physio_stat *ps;
	int concurrency = physio_concurrency - 1;
	int isdisk;

	error = RUN_ONCE(&physio_initialized, physio_init);
	if (__predict_false(error != 0)) {
		return error;
	}

	DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	flags &= B_READ | B_WRITE;

	ps = kmem_zalloc(sizeof(*ps), KM_SLEEP);
	/* ps->ps_running = 0; */
	/* ps->ps_error = 0; */
	/* ps->ps_failed = 0; */
	ps->ps_orig_bp = obp;
	ps->ps_endoffset = -1;
	ps->ps_resid = 0;
	mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ps->ps_cv, "physio");

	/* Allow concurrent I/O only for disks */
	isdisk = cdev_type(dev) == D_DISK;
	if (!isdisk)
		concurrency = 0;

	/* Make sure we have a buffer, creating one if necessary. */
	if (obp != NULL) {
		mutex_enter(&bufcache_lock);
		/* Mark it busy, so nobody else will use it. */
		while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
			;
		mutex_exit(&bufcache_lock);
		concurrency = 0; /* see "XXXkludge" comment below */
	}

	for (i = 0; i < uio->uio_iovcnt; i++) {
		bool sync = true;

		iovp = &uio->uio_iov[i];
		while (iovp->iov_len > 0) {
			size_t todo;
			vaddr_t endp;

			mutex_enter(&ps->ps_lock);
			if (ps->ps_failed != 0) {
				goto done_locked;
			}
			physio_wait(ps, sync ? 0 : concurrency);
			mutex_exit(&ps->ps_lock);
			if (obp != NULL) {
				/*
				 * XXXkludge
				 * some drivers use "obp" as an identifier.
				 */
				bp = obp;
			} else {
				bp = getiobuf(NULL, true);
				bp->b_cflags |= BC_BUSY;
			}
			bp->b_dev = dev;
			bp->b_proc = p;
			bp->b_private = ps;

			/*
			 * Mark the buffer busy for physical I/O.  Also set
			 * B_PHYS because it's an I/O to user memory, and
			 * B_RAW because B_RAW is to be "set by physio for
			 * raw transfers".
			 */
			bp->b_oflags = 0;
			bp->b_cflags |= BC_BUSY;
			bp->b_flags = flags | B_PHYS | B_RAW;
			bp->b_iodone = physio_biodone;

			/* Set up the buffer for a maximum-sized transfer. */
			bp->b_blkno = btodb(uio->uio_offset);
			if (isdisk) {
				/*
				 * For disks, check that offsets are at least
				 * block aligned; the block addresses are used
				 * to track errors of finished requests.
				 */
				if (dbtob(bp->b_blkno) != uio->uio_offset) {
					error = EINVAL;
					goto done;
				}
				/*
				 * Split request into MAXPHYS chunks
				 */
				bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
			} else {
				bp->b_bcount = iovp->iov_len;
			}
			bp->b_data = iovp->iov_base;

			/*
			 * Call minphys to bound the transfer size,
			 * and remember the amount of data to transfer,
			 * for later comparison.
			 */
			(*min_phys)(bp);
			todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
			if (todo > MAXPHYS)
				panic("todo(%zu) > MAXPHYS; minphys broken",
				    todo);
#endif /* defined(DIAGNOSTIC) */

			sync = false;
			endp = (vaddr_t)bp->b_data + todo;
			if (trunc_page(endp) != endp) {
				/*
				 * Following requests can overlap.
				 * Note that uvm_vslock does round_page.
				 */
				sync = true;
			}

			/*
			 * Lock the part of the user address space involved
			 * in the transfer.
			 */
			error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
			    (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
			if (error) {
				goto done;
			}

			/*
			 * Beware vmapbuf(); if successful it clobbers
			 * b_data and saves it in b_saveaddr.
			 * However, vunmapbuf() restores b_data.
			 */
			if ((error = vmapbuf(bp, todo)) != 0) {
				uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
				goto done;
			}

			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

			mutex_enter(&ps->ps_lock);
			ps->ps_running++;
			mutex_exit(&ps->ps_lock);

			/* Call strategy to start the transfer. */
			(*strategy)(bp);
			bp = NULL;

			iovp->iov_len -= todo;
			iovp->iov_base = (char *)iovp->iov_base + todo;
			uio->uio_offset += todo;
			uio->uio_resid -= todo;
		}
	}

done:
	mutex_enter(&ps->ps_lock);
done_locked:
	physio_wait(ps, 0);
	mutex_exit(&ps->ps_lock);

	KASSERT(ps->ps_failed || ps->ps_endoffset == -1);

	/*
	 * Compute the residual; for disks, adjust for the lowest
	 * numbered block that returned an error.
	 */
	if (isdisk) {
		if (ps->ps_failed != 0) {
			off_t delta;

			delta = uio->uio_offset - ps->ps_endoffset;
			KASSERT(delta > 0);
			uio->uio_resid += delta;
			/* uio->uio_offset = ps->ps_endoffset; */
		}
	} else {
		uio->uio_resid += ps->ps_resid;
	}

	if (bp != NULL && bp != obp) {
		putiobuf(bp);
	}
	if (error == 0) {
		error = ps->ps_error;
	}
	mutex_destroy(&ps->ps_lock);
	cv_destroy(&ps->ps_cv);
	kmem_free(ps, sizeof(*ps));

	/*
	 * Clean up the state of the buffer.  Remember if somebody wants
	 * it, so we can wake them up below.  Also, if we had to steal it,
	 * give it back.
	 */
	if (obp != NULL) {
		KASSERT((obp->b_cflags & BC_BUSY) != 0);

		/*
		 * If another process is waiting for the raw I/O buffer,
		 * wake up processes waiting to do physical I/O.
		 */
		mutex_enter(&bufcache_lock);
		obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
		obp->b_flags &= ~(B_PHYS | B_RAW);
		obp->b_iodone = NULL;
		cv_broadcast(&obp->b_busy);
		mutex_exit(&bufcache_lock);
	}

	DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	return error;
}
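
/*
 * Example usage (illustrative only; the "mydev" names below are
 * hypothetical and not part of this file): a character device driver
 * typically calls physio() from its read and write entry points,
 * handing it the driver's strategy routine and a minphys routine:
 *
 *	static int
 *	mydevread(dev_t dev, struct uio *uio, int flag)
 *	{
 *
 *		return physio(mydevstrategy, NULL, dev, B_READ,
 *		    minphys, uio);
 *	}
 *
 *	static int
 *	mydevwrite(dev_t dev, struct uio *uio, int flag)
 *	{
 *
 *		return physio(mydevstrategy, NULL, dev, B_WRITE,
 *		    minphys, uio);
 *	}
 *
 * Passing NULL for "obp" lets physio() allocate its own buffers and,
 * for disks, keep up to physio_concurrency transfers in flight.
 */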
395 */ 396 if (isdisk) { 397 if (ps->ps_failed != 0) { 398 off_t delta; 399 400 delta = uio->uio_offset - ps->ps_endoffset; 401 KASSERT(delta > 0); 402 uio->uio_resid += delta; 403 /* uio->uio_offset = ps->ps_endoffset; */ 404 } 405 } else { 406 uio->uio_resid += ps->ps_resid; 407 } 408 409 if (bp != NULL && bp != obp) { 410 putiobuf(bp); 411 } 412 if (error == 0) { 413 error = ps->ps_error; 414 } 415 mutex_destroy(&ps->ps_lock); 416 cv_destroy(&ps->ps_cv); 417 kmem_free(ps, sizeof(*ps)); 418 419 /* 420 * Clean up the state of the buffer. Remember if somebody wants 421 * it, so we can wake them up below. Also, if we had to steal it, 422 * give it back. 423 */ 424 if (obp != NULL) { 425 KASSERT((obp->b_cflags & BC_BUSY) != 0); 426 427 /* 428 * If another process is waiting for the raw I/O buffer, 429 * wake up processes waiting to do physical I/O; 430 */ 431 mutex_enter(&bufcache_lock); 432 obp->b_cflags &= ~(BC_BUSY | BC_WANTED); 433 obp->b_flags &= ~(B_PHYS | B_RAW); 434 obp->b_iodone = NULL; 435 cv_broadcast(&obp->b_busy); 436 mutex_exit(&bufcache_lock); 437 } 438 439 DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", 440 __func__, uio->uio_offset, uio->uio_resid)); 441 442 return error; 443 } 444 445 /* 446 * A minphys() routine is called by physio() to adjust the size of each 447 * I/O transfer before the latter is passed to the strategy routine. 448 * 449 * This minphys() is a default that must be called to enforce limits 450 * that are applicable to all devices, because of limitations in the 451 * kernel or the hardware platform. 452 */ 453 void 454 minphys(struct buf *bp) 455 { 456 457 if (bp->b_bcount > MAXPHYS) 458 bp->b_bcount = MAXPHYS; 459 } 460