/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved.
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elischer,
 * All rights reserved.
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/device.h>
#include <sys/tree.h>
#include <sys/syslink_rpc.h>
#include <sys/proc.h>
#include <machine/stdarg.h>
#include <sys/devfs.h>
#include <sys/dsched.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

static int mpsafe_writes;
static int mplock_writes;
static int mpsafe_reads;
static int mplock_reads;
static int mpsafe_strategies;
static int mplock_strategies;

SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes,
	   0, "mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes,
	   0, "non-mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads,
	   0, "mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads,
	   0, "non-mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies,
	   0, "mpsafe strategies");
SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies,
	   0, "non-mpsafe strategies");

/*
 * System link descriptors identify the command in the arguments structure.
 */
#define DDESCNAME(name)	__CONCAT(__CONCAT(dev_,name),_desc)

#define DEVOP_DESC_INIT(name)						\
	struct syslink_desc DDESCNAME(name) = {				\
		__offsetof(struct dev_ops, __CONCAT(d_, name)),		\
		#name }

DEVOP_DESC_INIT(default);
DEVOP_DESC_INIT(open);
DEVOP_DESC_INIT(close);
DEVOP_DESC_INIT(read);
DEVOP_DESC_INIT(write);
DEVOP_DESC_INIT(ioctl);
DEVOP_DESC_INIT(dump);
DEVOP_DESC_INIT(psize);
DEVOP_DESC_INIT(mmap);
DEVOP_DESC_INIT(strategy);
DEVOP_DESC_INIT(kqfilter);
DEVOP_DESC_INIT(revoke);
DEVOP_DESC_INIT(clone);
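/*
 * Illustrative expansion only, assuming the usual __CONCAT/__offsetof
 * semantics: DEVOP_DESC_INIT(open) above produces roughly
 *
 *	struct syslink_desc dev_open_desc = {
 *		__offsetof(struct dev_ops, d_open),
 *		"open"
 *	};
 *
 * i.e. each descriptor pairs the offset of an operation's function
 * pointer within struct dev_ops with the operation's name.
 */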
/*
 * Misc default ops
 */
struct dev_ops dead_dev_ops;

struct dev_ops default_dev_ops = {
	{ "null" },
	.d_default = NULL,	/* must be NULL */
	.d_open = noopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_mmap = nommap,
	.d_strategy = nostrategy,
	.d_dump = nodump,
	.d_psize = nopsize,
	.d_kqfilter = nokqfilter,
	.d_revoke = norevoke,
	.d_clone = noclone
};
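/*
 * Illustrative sketch only; the mydev_* names are hypothetical.  A driver
 * typically initializes just the entry points it implements and lets
 * compile_dev_ops() (below) fill the remaining slots from default_dev_ops,
 * or from d_default if one is supplied:
 *
 *	static struct dev_ops mydev_ops = {
 *		{ "mydev", 0, D_MPSAFE },
 *		.d_open =	mydev_open,
 *		.d_close =	mydev_close,
 *		.d_read =	mydev_read,
 *		.d_write =	mydev_write,
 *	};
 */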
static __inline
int
dev_needmplock(cdev_t dev)
{
	return((dev->si_ops->head.flags & D_MPSAFE) == 0);
}

/************************************************************************
 *			GENERAL DEVICE API FUNCTIONS			*
 ************************************************************************
 *
 * The MPSAFEness of these depends on dev->si_ops->head.flags
 */
int
dev_dopen(cdev_t dev, int oflags, int devtype, struct ucred *cred)
{
	struct dev_open_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_open_desc;
	ap.a_head.a_dev = dev;
	ap.a_oflags = oflags;
	ap.a_devtype = devtype;
	ap.a_cred = cred;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_open(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dclose(cdev_t dev, int fflag, int devtype)
{
	struct dev_close_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_close_desc;
	ap.a_head.a_dev = dev;
	ap.a_fflag = fflag;
	ap.a_devtype = devtype;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_close(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dread(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_read_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_read_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_reads;
	} else {
		++mpsafe_reads;
	}
	error = dev->si_ops->d_read(&ap);
	if (needmplock)
		rel_mplock();
	if (error == 0)
		dev->si_lastread = time_second;
	return (error);
}

int
dev_dwrite(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_write_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	dev->si_lastwrite = time_second;
	ap.a_head.a_desc = &dev_write_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_writes;
	} else {
		++mpsafe_writes;
	}
	error = dev->si_ops->d_write(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred,
	   struct sysmsg *msg)
{
	struct dev_ioctl_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_ioctl_desc;
	ap.a_head.a_dev = dev;
	ap.a_cmd = cmd;
	ap.a_data = data;
	ap.a_fflag = fflag;
	ap.a_cred = cred;
	ap.a_sysmsg = msg;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_ioctl(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot)
{
	struct dev_mmap_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_nprot = nprot;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(-1);
}

int
dev_dclone(cdev_t dev)
{
	struct dev_clone_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_clone_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_clone(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_drevoke(cdev_t dev)
{
	struct dev_revoke_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_revoke_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_revoke(&ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Core device strategy call, used to issue I/O on a device.  There are
 * two versions, a non-chained version and a chained version.  The chained
 * version reuses a BIO set up by vn_strategy().  The only difference is
 * that, for now, we do not push a new tracking structure when chaining
 * from vn_strategy.  XXX this will ultimately have to change.
 */
void
dev_dstrategy(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	struct bio_track *track;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track == NULL);
	KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
	if (bio->bio_buf->b_cmd == BUF_CMD_READ)
		track = &dev->si_track_read;
	else
		track = &dev->si_track_write;
	bio_track_ref(track);
	bio->bio_track = track;

	if (dsched_is_clear_buf_priv(bio->bio_buf))
		dsched_new_buf(bio->bio_buf);

	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock) {
		get_mplock();
		++mplock_strategies;
	} else {
		++mpsafe_strategies;
	}
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

void
dev_dstrategy_chain(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track != NULL);
	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock)
		get_mplock();
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}
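/*
 * Illustrative flow only; the mylayer_* names are hypothetical.  A
 * layering driver whose strategy routine receives a BIO whose tracking
 * structure was already set up by vn_strategy()/dev_dstrategy() adjusts
 * the request and forwards it with the chained version, so no new
 * tracking structure is pushed:
 *
 *	static int
 *	mylayer_strategy(struct dev_strategy_args *ap)
 *	{
 *		struct bio *bio = ap->a_bio;
 *
 *		bio->bio_offset += mylayer_base;	(offset fixup)
 *		dev_dstrategy_chain(mylayer_lower_dev, bio);
 *		return(0);
 *	}
 */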
/*
 * note: the disk layer is expected to set count, blkno, and secsize before
 * forwarding the message.
 */
int
dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset,
	  size_t length)
{
	struct dev_dump_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_dump_desc;
	ap.a_head.a_dev = dev;
	ap.a_count = 0;
	ap.a_blkno = 0;
	ap.a_secsize = 0;
	ap.a_virtual = virtual;
	ap.a_physical = physical;
	ap.a_offset = offset;
	ap.a_length = length;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_dump(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int64_t
dev_dpsize(cdev_t dev)
{
	struct dev_psize_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_psize_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_psize(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return (ap.a_result);
	return(-1);
}

/*
 * Pass-thru to the device kqfilter.
 *
 * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions
 *	 which return 0 do not have to bother setting a_result.
 */
int
dev_dkqfilter(cdev_t dev, struct knote *kn)
{
	struct dev_kqfilter_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_kqfilter_desc;
	ap.a_head.a_dev = dev;
	ap.a_kn = kn;
	ap.a_result = 0;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_kqfilter(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	else if (error == EOPNOTSUPP)
		return(EOPNOTSUPP);
	return(ENODEV);
}

/************************************************************************
 *			DEVICE HELPER FUNCTIONS				*
 ************************************************************************/

/*
 * MPSAFE
 */
int
dev_drefs(cdev_t dev)
{
	return(dev->si_sysref.refcnt);
}

/*
 * MPSAFE
 */
const char *
dev_dname(cdev_t dev)
{
	return(dev->si_ops->head.name);
}

/*
 * MPSAFE
 */
int
dev_dflags(cdev_t dev)
{
	return(dev->si_ops->head.flags);
}

/*
 * MPSAFE
 */
int
dev_dmaj(cdev_t dev)
{
	return(dev->si_ops->head.maj);
}

/*
 * Used when forwarding a request through layers.  The caller adjusts
 * ap->a_head.a_dev and then calls this function.
 */
int
dev_doperate(struct dev_generic_args *ap)
{
	int (*func)(struct dev_generic_args *);
	int needmplock = dev_needmplock(ap->a_dev);
	int error;

	func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset);

	if (needmplock)
		get_mplock();
	error = func(ap);
	if (needmplock)
		rel_mplock();

	return (error);
}
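/*
 * Illustrative sketch only; upper_read() and lower_dev are hypothetical.
 * A layered driver retargets the argument head at the device below it
 * and re-dispatches through dev_doperate():
 *
 *	static int
 *	upper_read(struct dev_read_args *ap)
 *	{
 *		ap->a_head.a_dev = lower_dev;
 *		return (dev_doperate(&ap->a_head));
 *	}
 */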
/*
 * Used by the console intercept code only.  Issue an operation through
 * a foreign ops structure allowing the ops structure associated
 * with the device to remain intact.
 */
int
dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap)
{
	int (*func)(struct dev_generic_args *);
	int needmplock = ((ops->head.flags & D_MPSAFE) == 0);
	int error;

	func = *(void **)((char *)ops + ap->a_desc->sd_offset);

	if (needmplock)
		get_mplock();
	error = func(ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Convert a template dev_ops into the real thing by filling in
 * uninitialized fields.
 */
void
compile_dev_ops(struct dev_ops *ops)
{
	int offset;

	for (offset = offsetof(struct dev_ops, dev_ops_first_field);
	     offset <= offsetof(struct dev_ops, dev_ops_last_field);
	     offset += sizeof(void *)
	) {
		void **func_p = (void **)((char *)ops + offset);
		void **def_p = (void **)((char *)&default_dev_ops + offset);
		if (*func_p == NULL) {
			if (ops->d_default)
				*func_p = ops->d_default;
			else
				*func_p = *def_p;
		}
	}
}

/************************************************************************
 *			MAJOR/MINOR SPACE FUNCTIONS			*
 ************************************************************************/

/*
 * This makes a dev_ops entry visible to userland (e.g. /dev/<blah>).
 *
 * Disk devices typically register their major, e.g. 'ad0', and then call
 * into the disk label management code which overloads its own onto e.g. 'ad0'
 * to support all the various slice and partition combinations.
 *
 * The mask/match supplied in this call are a full 32 bits and the same
 * mask and match must be specified in a later dev_ops_remove() call to
 * match this add.  However, the match value for the minor number should never
 * have any bits set in the major number's bit range (8-15).  The mask value
 * may be conveniently specified as -1 without creating any major number
 * interference.
 */

static
int
rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b)
{
	if (a->maj < b->maj)
		return(-1);
	else if (a->maj > b->maj)
		return(1);
	return(0);
}

RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj);

struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead);

int
dev_ops_remove_all(struct dev_ops *ops)
{
	return devfs_destroy_dev_by_ops(ops, -1);
}

int
dev_ops_remove_minor(struct dev_ops *ops, int minor)
{
	return devfs_destroy_dev_by_ops(ops, minor);
}

struct dev_ops *
dev_ops_intercept(cdev_t dev, struct dev_ops *iops)
{
	struct dev_ops *oops = dev->si_ops;

	compile_dev_ops(iops);
	iops->head.maj = oops->head.maj;
	iops->head.data = oops->head.data;
	iops->head.flags = oops->head.flags;
	dev->si_ops = iops;
	dev->si_flags |= SI_INTERCEPTED;

	return (oops);
}

void
dev_ops_restore(cdev_t dev, struct dev_ops *oops)
{
	struct dev_ops *iops = dev->si_ops;

	dev->si_ops = oops;
	dev->si_flags &= ~SI_INTERCEPTED;
	iops->head.maj = 0;
	iops->head.data = NULL;
	iops->head.flags = 0;
}
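/*
 * Illustrative pairing only; my_intercept_ops is hypothetical.  The
 * console intercept code substitutes its own ops and later restores
 * the originals:
 *
 *	struct dev_ops *oops;
 *
 *	oops = dev_ops_intercept(dev, &my_intercept_ops);
 *	...		(operate on the intercepted device)
 *	dev_ops_restore(dev, oops);
 */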
/************************************************************************
 *			DEFAULT DEV OPS FUNCTIONS			*
 ************************************************************************/

/*
 * Unsupported devswitch functions (e.g. for writing to a read-only device).
 * XXX may belong elsewhere.
 */
int
norevoke(struct dev_revoke_args *ap)
{
	/* take no action */
	return(0);
}

int
noclone(struct dev_clone_args *ap)
{
	/* take no action */
	return (0);	/* allow the clone */
}

int
noopen(struct dev_open_args *ap)
{
	return (ENODEV);
}

int
noclose(struct dev_close_args *ap)
{
	return (ENODEV);
}

int
noread(struct dev_read_args *ap)
{
	return (ENODEV);
}

int
nowrite(struct dev_write_args *ap)
{
	return (ENODEV);
}

int
noioctl(struct dev_ioctl_args *ap)
{
	return (ENODEV);
}

int
nokqfilter(struct dev_kqfilter_args *ap)
{
	return (ENODEV);
}

int
nommap(struct dev_mmap_args *ap)
{
	return (ENODEV);
}

int
nostrategy(struct dev_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;

	bio->bio_buf->b_flags |= B_ERROR;
	bio->bio_buf->b_error = EOPNOTSUPP;
	biodone(bio);
	return(0);
}

int
nopsize(struct dev_psize_args *ap)
{
	ap->a_result = 0;
	return(0);
}

int
nodump(struct dev_dump_args *ap)
{
	return (ENODEV);
}

/*
 * XXX this is probably bogus.  Any device that uses it isn't checking the
 * minor number.
 */
int
nullopen(struct dev_open_args *ap)
{
	return (0);
}

int
nullclose(struct dev_close_args *ap)
{
	return (0);
}
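/*
 * Illustrative note, assuming the usual biowait() convention: because
 * strategy is asynchronous, nostrategy() above fails the BIO via
 * biodone() instead of returning an error, so a hypothetical synchronous
 * caller observes the failure through the buffer:
 *
 *	dev_dstrategy(dev, &bp->b_bio1);
 *	error = biowait(&bp->b_bio1, "nowait");	(yields EOPNOTSUPP)
 */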