1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 2009 The DragonFly Project. All rights reserved. 5 * 6 * This code is derived from software contributed to The DragonFly Project 7 * by Alex Hornung <ahornung@gmail.com> 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * 3. Neither the name of The DragonFly Project nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific, prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 33 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 #include <sys/param.h> 37 #include <sys/systm.h> 38 #include <sys/time.h> 39 #include <sys/kernel.h> 40 #include <sys/lock.h> 41 #include <sys/fcntl.h> 42 #include <sys/proc.h> 43 #include <sys/caps.h> 44 #include <sys/signalvar.h> 45 #include <sys/vnode.h> 46 #include <sys/uio.h> 47 #include <sys/mount.h> 48 #include <sys/file.h> 49 #include <sys/dirent.h> 50 #include <sys/malloc.h> 51 #include <sys/stat.h> 52 #include <sys/reg.h> 53 #include <vm/vm_pager.h> 54 #include <vm/vm_zone.h> 55 #include <vm/vm_object.h> 56 #include <sys/filio.h> 57 #include <sys/ttycom.h> 58 #include <sys/tty.h> 59 #include <sys/diskslice.h> 60 #include <sys/sysctl.h> 61 #include <sys/devfs.h> 62 #include <sys/pioctl.h> 63 #include <vfs/fifofs/fifo.h> 64 65 #include <machine/limits.h> 66 67 #include <sys/buf2.h> 68 #include <vm/vm_page2.h> 69 70 #ifndef SPEC_CHAIN_DEBUG 71 #define SPEC_CHAIN_DEBUG 0 72 #endif 73 74 MALLOC_DECLARE(M_DEVFS); 75 #define DEVFS_BADOP (void *)devfs_vop_badop 76 77 static int devfs_vop_badop(struct vop_generic_args *); 78 static int devfs_vop_access(struct vop_access_args *); 79 static int devfs_vop_inactive(struct vop_inactive_args *); 80 static int devfs_vop_reclaim(struct vop_reclaim_args *); 81 static int devfs_vop_readdir(struct vop_readdir_args *); 82 static int devfs_vop_getattr(struct vop_getattr_args *); 83 static int devfs_vop_setattr(struct vop_setattr_args *); 84 static int devfs_vop_readlink(struct vop_readlink_args *); 85 static int devfs_vop_print(struct vop_print_args *); 86 87 static int devfs_vop_nresolve(struct vop_nresolve_args *); 88 static int devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 89 static int devfs_vop_nmkdir(struct vop_nmkdir_args *); 90 static int devfs_vop_nsymlink(struct vop_nsymlink_args *); 
91 static int devfs_vop_nrmdir(struct vop_nrmdir_args *); 92 static int devfs_vop_nremove(struct vop_nremove_args *); 93 94 static int devfs_spec_open(struct vop_open_args *); 95 static int devfs_spec_close(struct vop_close_args *); 96 static int devfs_spec_fsync(struct vop_fsync_args *); 97 98 static int devfs_spec_read(struct vop_read_args *); 99 static int devfs_spec_write(struct vop_write_args *); 100 static int devfs_spec_ioctl(struct vop_ioctl_args *); 101 static int devfs_spec_kqfilter(struct vop_kqfilter_args *); 102 static int devfs_spec_strategy(struct vop_strategy_args *); 103 static void devfs_spec_strategy_done(struct bio *); 104 static int devfs_spec_freeblks(struct vop_freeblks_args *); 105 static int devfs_spec_bmap(struct vop_bmap_args *); 106 static int devfs_spec_advlock(struct vop_advlock_args *); 107 static void devfs_spec_getpages_iodone(struct bio *); 108 static int devfs_spec_getpages(struct vop_getpages_args *); 109 110 static int devfs_fo_close(struct file *); 111 static int devfs_fo_read(struct file *, struct uio *, struct ucred *, int); 112 static int devfs_fo_write(struct file *, struct uio *, struct ucred *, int); 113 static int devfs_fo_stat(struct file *, struct stat *, struct ucred *); 114 static int devfs_fo_kqfilter(struct file *, struct knote *); 115 static int devfs_fo_ioctl(struct file *, u_long, caddr_t, 116 struct ucred *, struct sysmsg *); 117 static int devfs_fo_seek(struct file *, off_t, int, off_t *); 118 static __inline int sequential_heuristic(struct uio *, struct file *); 119 120 extern struct lock devfs_lock; 121 122 /* 123 * devfs vnode operations for regular files. All vnode ops are MPSAFE. 124 */ 125 struct vop_ops devfs_vnode_norm_vops = { 126 .vop_default = vop_defaultop, 127 .vop_access = devfs_vop_access, 128 .vop_advlock = DEVFS_BADOP, 129 .vop_bmap = DEVFS_BADOP, 130 .vop_close = vop_stdclose, 131 .vop_getattr = devfs_vop_getattr, 132 .vop_inactive = devfs_vop_inactive, 133 .vop_ncreate = DEVFS_BADOP, 134 .vop_nresolve = devfs_vop_nresolve, 135 .vop_nlookupdotdot = devfs_vop_nlookupdotdot, 136 .vop_nlink = DEVFS_BADOP, 137 .vop_nmkdir = devfs_vop_nmkdir, 138 .vop_nmknod = DEVFS_BADOP, 139 .vop_nremove = devfs_vop_nremove, 140 .vop_nrename = DEVFS_BADOP, 141 .vop_nrmdir = devfs_vop_nrmdir, 142 .vop_nsymlink = devfs_vop_nsymlink, 143 .vop_open = vop_stdopen, 144 .vop_pathconf = vop_stdpathconf, 145 .vop_print = devfs_vop_print, 146 .vop_read = DEVFS_BADOP, 147 .vop_readdir = devfs_vop_readdir, 148 .vop_readlink = devfs_vop_readlink, 149 .vop_reallocblks = DEVFS_BADOP, 150 .vop_reclaim = devfs_vop_reclaim, 151 .vop_setattr = devfs_vop_setattr, 152 .vop_write = DEVFS_BADOP, 153 .vop_ioctl = DEVFS_BADOP 154 }; 155 156 /* 157 * devfs vnode operations for character devices. All vnode ops are MPSAFE. 
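 * NOTE: These entries wrap the raw driver entry points; the devfs_spec_*
 *	 functions below forward to dev_dopen(), dev_dread(), dev_dwrite(),
 *	 dev_dioctl(), dev_dstrategy() and friends.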
158 */ 159 struct vop_ops devfs_vnode_dev_vops = { 160 .vop_default = vop_defaultop, 161 .vop_access = devfs_vop_access, 162 .vop_advlock = devfs_spec_advlock, 163 .vop_bmap = devfs_spec_bmap, 164 .vop_close = devfs_spec_close, 165 .vop_freeblks = devfs_spec_freeblks, 166 .vop_fsync = devfs_spec_fsync, 167 .vop_getattr = devfs_vop_getattr, 168 .vop_getpages = devfs_spec_getpages, 169 .vop_inactive = devfs_vop_inactive, 170 .vop_open = devfs_spec_open, 171 .vop_pathconf = vop_stdpathconf, 172 .vop_print = devfs_vop_print, 173 .vop_kqfilter = devfs_spec_kqfilter, 174 .vop_read = devfs_spec_read, 175 .vop_readdir = DEVFS_BADOP, 176 .vop_readlink = DEVFS_BADOP, 177 .vop_reallocblks = DEVFS_BADOP, 178 .vop_reclaim = devfs_vop_reclaim, 179 .vop_setattr = devfs_vop_setattr, 180 .vop_strategy = devfs_spec_strategy, 181 .vop_write = devfs_spec_write, 182 .vop_ioctl = devfs_spec_ioctl 183 }; 184 185 /* 186 * devfs file pointer operations. All fileops are MPSAFE. 187 */ 188 struct vop_ops *devfs_vnode_dev_vops_p = &devfs_vnode_dev_vops; 189 190 struct fileops devfs_dev_fileops = { 191 .fo_read = devfs_fo_read, 192 .fo_write = devfs_fo_write, 193 .fo_ioctl = devfs_fo_ioctl, 194 .fo_kqfilter = devfs_fo_kqfilter, 195 .fo_stat = devfs_fo_stat, 196 .fo_close = devfs_fo_close, 197 .fo_shutdown = nofo_shutdown, 198 .fo_seek = devfs_fo_seek 199 }; 200 201 /* 202 * These two functions are possibly temporary hacks for devices (aka 203 * the pty code) which want to control the node attributes themselves. 204 * 205 * XXX we may ultimately desire to simply remove the uid/gid/mode 206 * from the node entirely. 207 * 208 * MPSAFE - sorta. Theoretically the overwrite can compete since they 209 * are loading from the same fields. 210 */ 211 static __inline void 212 node_sync_dev_get(struct devfs_node *node) 213 { 214 cdev_t dev; 215 216 if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) { 217 node->uid = dev->si_uid; 218 node->gid = dev->si_gid; 219 node->mode = dev->si_perms; 220 } 221 } 222 223 static __inline void 224 node_sync_dev_set(struct devfs_node *node) 225 { 226 cdev_t dev; 227 228 if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) { 229 dev->si_uid = node->uid; 230 dev->si_gid = node->gid; 231 dev->si_perms = node->mode; 232 } 233 } 234 235 /* 236 * generic entry point for unsupported operations 237 */ 238 static int 239 devfs_vop_badop(struct vop_generic_args *ap) 240 { 241 return (EIO); 242 } 243 244 245 static int 246 devfs_vop_access(struct vop_access_args *ap) 247 { 248 struct devfs_node *node = DEVFS_NODE(ap->a_vp); 249 int error; 250 251 if (!devfs_node_is_accessible(node)) 252 return ENOENT; 253 node_sync_dev_get(node); 254 error = vop_helper_access(ap, node->uid, node->gid, 255 node->mode, node->flags); 256 257 return error; 258 } 259 260 261 static int 262 devfs_vop_inactive(struct vop_inactive_args *ap) 263 { 264 struct devfs_node *node = DEVFS_NODE(ap->a_vp); 265 266 if (node == NULL || (node->flags & DEVFS_NODE_LINKED) == 0) 267 vrecycle(ap->a_vp); 268 return 0; 269 } 270 271 272 static int 273 devfs_vop_reclaim(struct vop_reclaim_args *ap) 274 { 275 struct devfs_node *node; 276 struct vnode *vp; 277 int locked; 278 279 /* 280 * Check if it is locked already. if not, we acquire the devfs lock 281 */ 282 if ((lockstatus(&devfs_lock, curthread)) != LK_EXCLUSIVE) { 283 lockmgr(&devfs_lock, LK_EXCLUSIVE); 284 locked = 1; 285 } else { 286 locked = 0; 287 } 288 289 /* 290 * Get rid of the devfs_node if it is no longer linked into the 291 * topology. Interlocked by devfs_lock. 
However, be careful 292 * interposing other operations between cleaning out v_data and 293 * devfs_freep() as the node is only protected by devfs_lock 294 * once the vnode is disassociated. 295 */ 296 vp = ap->a_vp; 297 node = DEVFS_NODE(vp); 298 299 if (node) { 300 if (node->v_node != vp) { 301 kprintf("NODE->V_NODE MISMATCH VP=%p NODEVP=%p\n", 302 vp, node->v_node); 303 } 304 vp->v_data = NULL; 305 node->v_node = NULL; 306 if ((node->flags & DEVFS_NODE_LINKED) == 0) 307 devfs_freep(node); 308 } 309 v_release_rdev(vp); 310 311 if (locked) 312 lockmgr(&devfs_lock, LK_RELEASE); 313 314 /* 315 * v_rdev needs to be properly released using v_release_rdev 316 * Make sure v_data is NULL as well. 317 */ 318 return 0; 319 } 320 321 322 static int 323 devfs_vop_readdir(struct vop_readdir_args *ap) 324 { 325 struct devfs_node *dnode = DEVFS_NODE(ap->a_vp); 326 struct devfs_node *node; 327 int cookie_index; 328 int ncookies; 329 int error2; 330 int error; 331 int r; 332 off_t *cookies; 333 off_t saveoff; 334 335 devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_readdir() called!\n"); 336 337 if (ap->a_uio->uio_offset < 0 || ap->a_uio->uio_offset > INT_MAX) 338 return (EINVAL); 339 error = vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM); 340 if (error) 341 return (error); 342 343 if (!devfs_node_is_accessible(dnode)) { 344 vn_unlock(ap->a_vp); 345 return ENOENT; 346 } 347 348 lockmgr(&devfs_lock, LK_EXCLUSIVE); 349 350 saveoff = ap->a_uio->uio_offset; 351 352 if (ap->a_ncookies) { 353 ncookies = ap->a_uio->uio_resid / 16 + 1; /* Why / 16 ?? */ 354 if (ncookies > 256) 355 ncookies = 256; 356 cookies = kmalloc(256 * sizeof(off_t), M_TEMP, M_WAITOK); 357 cookie_index = 0; 358 } else { 359 ncookies = -1; 360 cookies = NULL; 361 cookie_index = 0; 362 } 363 364 vfs_timestamp(&dnode->atime); 365 366 if (saveoff == 0) { 367 r = vop_write_dirent(&error, ap->a_uio, dnode->d_dir.d_ino, 368 DT_DIR, 1, "."); 369 if (r) 370 goto done; 371 if (cookies) 372 cookies[cookie_index] = saveoff; 373 saveoff++; 374 cookie_index++; 375 if (cookie_index == ncookies) 376 goto done; 377 } 378 379 if (saveoff == 1) { 380 if (dnode->parent) { 381 r = vop_write_dirent(&error, ap->a_uio, 382 dnode->parent->d_dir.d_ino, 383 DT_DIR, 2, ".."); 384 } else { 385 r = vop_write_dirent(&error, ap->a_uio, 386 dnode->d_dir.d_ino, 387 DT_DIR, 2, ".."); 388 } 389 if (r) 390 goto done; 391 if (cookies) 392 cookies[cookie_index] = saveoff; 393 saveoff++; 394 cookie_index++; 395 if (cookie_index == ncookies) 396 goto done; 397 } 398 399 TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) { 400 if ((node->flags & DEVFS_HIDDEN) || 401 (node->flags & DEVFS_INVISIBLE)) { 402 continue; 403 } 404 405 /* 406 * If the node type is a valid devfs alias, then we make 407 * sure that the target isn't hidden. If it is, we don't 408 * show the link in the directory listing. 
409 */ 410 if ((node->node_type == Nlink) && (node->link_target != NULL) && 411 (node->link_target->flags & DEVFS_HIDDEN)) 412 continue; 413 414 if (node->cookie < saveoff) 415 continue; 416 417 saveoff = node->cookie; 418 419 error2 = vop_write_dirent(&error, ap->a_uio, node->d_dir.d_ino, 420 node->d_dir.d_type, 421 node->d_dir.d_namlen, 422 node->d_dir.d_name); 423 424 if (error2) 425 break; 426 427 saveoff++; 428 429 if (cookies) 430 cookies[cookie_index] = node->cookie; 431 ++cookie_index; 432 if (cookie_index == ncookies) 433 break; 434 } 435 436 done: 437 lockmgr(&devfs_lock, LK_RELEASE); 438 vn_unlock(ap->a_vp); 439 440 ap->a_uio->uio_offset = saveoff; 441 if (error && cookie_index == 0) { 442 if (cookies) { 443 kfree(cookies, M_TEMP); 444 *ap->a_ncookies = 0; 445 *ap->a_cookies = NULL; 446 } 447 } else { 448 if (cookies) { 449 *ap->a_ncookies = cookie_index; 450 *ap->a_cookies = cookies; 451 } 452 } 453 return (error); 454 } 455 456 457 static int 458 devfs_vop_nresolve(struct vop_nresolve_args *ap) 459 { 460 struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp); 461 struct devfs_node *node, *found = NULL; 462 struct namecache *ncp; 463 struct vnode *vp = NULL; 464 int error = 0; 465 int len; 466 int depth; 467 468 ncp = ap->a_nch->ncp; 469 len = ncp->nc_nlen; 470 471 if (!devfs_node_is_accessible(dnode)) 472 return ENOENT; 473 474 lockmgr(&devfs_lock, LK_EXCLUSIVE); 475 476 if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) { 477 error = ENOENT; 478 cache_setvp(ap->a_nch, NULL); 479 goto out; 480 } 481 482 TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) { 483 if (len == node->d_dir.d_namlen) { 484 if (!memcmp(ncp->nc_name, node->d_dir.d_name, len)) { 485 found = node; 486 break; 487 } 488 } 489 } 490 491 if (found) { 492 depth = 0; 493 while ((found->node_type == Nlink) && (found->link_target)) { 494 if (depth >= 8) { 495 devfs_debug(DEVFS_DEBUG_SHOW, "Recursive link or depth >= 8"); 496 break; 497 } 498 499 found = found->link_target; 500 ++depth; 501 } 502 503 if (!(found->flags & DEVFS_HIDDEN)) 504 devfs_allocv(/*ap->a_dvp->v_mount, */ &vp, found); 505 } 506 507 if (vp == NULL) { 508 error = ENOENT; 509 cache_setvp(ap->a_nch, NULL); 510 goto out; 511 512 } 513 KKASSERT(vp); 514 vn_unlock(vp); 515 cache_setvp(ap->a_nch, vp); 516 vrele(vp); 517 out: 518 lockmgr(&devfs_lock, LK_RELEASE); 519 520 return error; 521 } 522 523 524 static int 525 devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 526 { 527 struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp); 528 529 *ap->a_vpp = NULL; 530 if (!devfs_node_is_accessible(dnode)) 531 return ENOENT; 532 533 lockmgr(&devfs_lock, LK_EXCLUSIVE); 534 if (dnode->parent != NULL) { 535 devfs_allocv(ap->a_vpp, dnode->parent); 536 vn_unlock(*ap->a_vpp); 537 } 538 lockmgr(&devfs_lock, LK_RELEASE); 539 540 return ((*ap->a_vpp == NULL) ? ENOENT : 0); 541 } 542 543 544 /* 545 * getattr() - Does not need a lock since the vp is refd 546 */ 547 static int 548 devfs_vop_getattr(struct vop_getattr_args *ap) 549 { 550 struct devfs_node *node = DEVFS_NODE(ap->a_vp); 551 struct vattr *vap = ap->a_vap; 552 struct partinfo pinfo; 553 int error = 0; 554 555 #if 0 556 if (!devfs_node_is_accessible(node)) 557 return ENOENT; 558 #endif 559 560 /* 561 * XXX This is a temporary hack to prevent crashes when the device is 562 * being destroyed (and so the underlying node will be gone) while 563 * a userland program is blocked in a read(). 
564 */ 565 if (node == NULL) 566 return EIO; 567 568 node_sync_dev_get(node); 569 570 /* start by zeroing out the attributes */ 571 VATTR_NULL(vap); 572 573 /* next do all the common fields */ 574 vap->va_type = ap->a_vp->v_type; 575 vap->va_mode = node->mode; 576 vap->va_fileid = DEVFS_NODE(ap->a_vp)->d_dir.d_ino ; 577 vap->va_flags = 0; 578 vap->va_blocksize = DEV_BSIZE; 579 vap->va_bytes = vap->va_size = 0; 580 581 vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; 582 583 vap->va_atime = node->atime; 584 vap->va_mtime = node->mtime; 585 vap->va_ctime = node->ctime; 586 587 vap->va_nlink = 1; /* number of references to file */ 588 589 vap->va_uid = node->uid; 590 vap->va_gid = node->gid; 591 592 vap->va_rmajor = 0; 593 vap->va_rminor = 0; 594 595 if ((node->node_type == Ndev) && node->d_dev) { 596 reference_dev(node->d_dev); 597 vap->va_rminor = node->d_dev->si_uminor; 598 release_dev(node->d_dev); 599 } 600 601 /* For a softlink the va_size is the length of the softlink */ 602 if (node->symlink_name != 0) { 603 vap->va_bytes = vap->va_size = node->symlink_namelen; 604 } 605 606 /* 607 * For a disk-type device, va_size is the size of the underlying 608 * device, so that lseek() works properly. 609 */ 610 if ((node->d_dev) && (dev_dflags(node->d_dev) & D_DISK)) { 611 bzero(&pinfo, sizeof(pinfo)); 612 error = dev_dioctl(node->d_dev, DIOCGPART, (void *)&pinfo, 613 0, proc0.p_ucred, NULL, NULL); 614 if ((error == 0) && (pinfo.media_blksize != 0)) { 615 vap->va_size = pinfo.media_size; 616 } else { 617 vap->va_size = 0; 618 error = 0; 619 } 620 } 621 622 return (error); 623 } 624 625 static int 626 devfs_vop_setattr(struct vop_setattr_args *ap) 627 { 628 struct devfs_node *node = DEVFS_NODE(ap->a_vp); 629 struct vattr *vap; 630 uid_t cur_uid; 631 gid_t cur_gid; 632 mode_t cur_mode; 633 int error = 0; 634 635 if (!devfs_node_is_accessible(node)) 636 return ENOENT; 637 node_sync_dev_get(node); 638 639 vap = ap->a_vap; 640 641 if ((vap->va_uid != (uid_t)VNOVAL) || (vap->va_gid != (gid_t)VNOVAL)) { 642 cur_uid = node->uid; 643 cur_gid = node->gid; 644 cur_mode = node->mode; 645 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 646 ap->a_cred, &cur_uid, &cur_gid, &cur_mode); 647 if (error) 648 goto out; 649 650 if (node->uid != cur_uid || node->gid != cur_gid) { 651 node->uid = cur_uid; 652 node->gid = cur_gid; 653 node->mode = cur_mode; 654 } 655 } 656 657 if (vap->va_mode != (mode_t)VNOVAL) { 658 cur_mode = node->mode; 659 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 660 node->uid, node->gid, &cur_mode); 661 if (error == 0 && node->mode != cur_mode) { 662 node->mode = cur_mode; 663 } 664 } 665 666 out: 667 node_sync_dev_set(node); 668 vfs_timestamp(&node->ctime); 669 670 return error; 671 } 672 673 674 static int 675 devfs_vop_readlink(struct vop_readlink_args *ap) 676 { 677 struct devfs_node *node = DEVFS_NODE(ap->a_vp); 678 int ret; 679 680 if (!devfs_node_is_accessible(node)) 681 return ENOENT; 682 683 lockmgr(&devfs_lock, LK_SHARED); 684 ret = uiomove(node->symlink_name, node->symlink_namelen, ap->a_uio); 685 lockmgr(&devfs_lock, LK_RELEASE); 686 687 return ret; 688 } 689 690 691 static int 692 devfs_vop_print(struct vop_print_args *ap) 693 { 694 return (0); 695 } 696 697 static int 698 devfs_vop_nmkdir(struct vop_nmkdir_args *ap) 699 { 700 struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp); 701 struct devfs_node *node; 702 703 if (!devfs_node_is_accessible(dnode)) 704 return ENOENT; 705 706 if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) 707 
goto out; 708 709 lockmgr(&devfs_lock, LK_EXCLUSIVE); 710 devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Ndir, 711 ap->a_nch->ncp->nc_name, dnode, NULL); 712 713 if (*ap->a_vpp) { 714 node = DEVFS_NODE(*ap->a_vpp); 715 node->flags |= DEVFS_USER_CREATED; 716 cache_setunresolved(ap->a_nch); 717 cache_setvp(ap->a_nch, *ap->a_vpp); 718 } 719 lockmgr(&devfs_lock, LK_RELEASE); 720 out: 721 return ((*ap->a_vpp == NULL) ? ENOTDIR : 0); 722 } 723 724 static int 725 devfs_vop_nsymlink(struct vop_nsymlink_args *ap) 726 { 727 struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp); 728 struct devfs_node *node; 729 size_t targetlen; 730 731 if (!devfs_node_is_accessible(dnode)) 732 return ENOENT; 733 734 ap->a_vap->va_type = VLNK; 735 736 if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) 737 goto out; 738 739 lockmgr(&devfs_lock, LK_EXCLUSIVE); 740 devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Nlink, 741 ap->a_nch->ncp->nc_name, dnode, NULL); 742 743 targetlen = strlen(ap->a_target); 744 if (*ap->a_vpp) { 745 node = DEVFS_NODE(*ap->a_vpp); 746 node->flags |= DEVFS_USER_CREATED; 747 node->symlink_namelen = targetlen; 748 node->symlink_name = kmalloc(targetlen + 1, M_DEVFS, M_WAITOK); 749 memcpy(node->symlink_name, ap->a_target, targetlen); 750 node->symlink_name[targetlen] = '\0'; 751 cache_setunresolved(ap->a_nch); 752 cache_setvp(ap->a_nch, *ap->a_vpp); 753 } 754 lockmgr(&devfs_lock, LK_RELEASE); 755 out: 756 return ((*ap->a_vpp == NULL) ? ENOTDIR : 0); 757 } 758 759 static int 760 devfs_vop_nrmdir(struct vop_nrmdir_args *ap) 761 { 762 struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp); 763 struct devfs_node *node; 764 struct namecache *ncp; 765 int error = ENOENT; 766 767 ncp = ap->a_nch->ncp; 768 769 if (!devfs_node_is_accessible(dnode)) 770 return ENOENT; 771 772 lockmgr(&devfs_lock, LK_EXCLUSIVE); 773 774 if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) 775 goto out; 776 777 TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) { 778 if (ncp->nc_nlen != node->d_dir.d_namlen) 779 continue; 780 if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen)) 781 continue; 782 783 /* 784 * only allow removal of user created dirs 785 */ 786 if ((node->flags & DEVFS_USER_CREATED) == 0) { 787 error = EPERM; 788 goto out; 789 } else if (node->node_type != Ndir) { 790 error = ENOTDIR; 791 goto out; 792 } else if (node->nchildren > 2) { 793 error = ENOTEMPTY; 794 goto out; 795 } else { 796 if (node->v_node) 797 cache_inval_vp(node->v_node, CINV_DESTROY); 798 devfs_unlinkp(node); 799 error = 0; 800 break; 801 } 802 } 803 804 cache_unlink(ap->a_nch); 805 out: 806 lockmgr(&devfs_lock, LK_RELEASE); 807 return error; 808 } 809 810 static int 811 devfs_vop_nremove(struct vop_nremove_args *ap) 812 { 813 struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp); 814 struct devfs_node *node; 815 struct namecache *ncp; 816 int error = ENOENT; 817 818 ncp = ap->a_nch->ncp; 819 820 if (!devfs_node_is_accessible(dnode)) 821 return ENOENT; 822 823 lockmgr(&devfs_lock, LK_EXCLUSIVE); 824 825 if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) 826 goto out; 827 828 TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) { 829 if (ncp->nc_nlen != node->d_dir.d_namlen) 830 continue; 831 if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen)) 832 continue; 833 834 /* 835 * only allow removal of user created stuff (e.g. 
symlinks) 836 */ 837 if ((node->flags & DEVFS_USER_CREATED) == 0) { 838 error = EPERM; 839 goto out; 840 } else if (node->node_type == Ndir) { 841 error = EISDIR; 842 goto out; 843 } else { 844 if (node->v_node) 845 cache_inval_vp(node->v_node, CINV_DESTROY); 846 devfs_unlinkp(node); 847 error = 0; 848 break; 849 } 850 } 851 852 cache_unlink(ap->a_nch); 853 out: 854 lockmgr(&devfs_lock, LK_RELEASE); 855 return error; 856 } 857 858 859 static int 860 devfs_spec_open(struct vop_open_args *ap) 861 { 862 struct vnode *vp = ap->a_vp; 863 struct vnode *orig_vp = NULL; 864 struct devfs_node *node = DEVFS_NODE(vp); 865 struct devfs_node *newnode; 866 cdev_t dev, ndev = NULL; 867 int error = 0; 868 869 if (node) { 870 if (node->d_dev == NULL) 871 return ENXIO; 872 if (!devfs_node_is_accessible(node)) 873 return ENOENT; 874 } 875 876 if ((dev = vp->v_rdev) == NULL) 877 return ENXIO; 878 879 /* 880 * Simple devices that don't care. Retain the shared lock. 881 */ 882 if (dev_dflags(dev) & D_QUICK) { 883 vn_unlock(vp); 884 error = dev_dopen(dev, ap->a_mode, S_IFCHR, 885 ap->a_cred, ap->a_fpp, vp); 886 vn_lock(vp, LK_SHARED | LK_RETRY); 887 if (error) 888 return error; 889 vop_stdopen(ap); 890 goto skip; 891 } 892 893 /* 894 * Slow code 895 */ 896 vn_lock(vp, LK_UPGRADE | LK_RETRY); 897 if (node && ap->a_fpp) { 898 int exists; 899 900 devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_open: -1.1-\n"); 901 lockmgr(&devfs_lock, LK_SHARED); 902 903 ndev = devfs_clone(dev, node->d_dir.d_name, 904 node->d_dir.d_namlen, 905 ap->a_mode, ap->a_cred); 906 if (ndev != NULL) { 907 lockmgr(&devfs_lock, LK_RELEASE); 908 lockmgr(&devfs_lock, LK_EXCLUSIVE); 909 newnode = devfs_create_device_node( 910 DEVFS_MNTDATA(vp->v_mount)->root_node, 911 ndev, &exists, NULL, NULL); 912 /* XXX: possibly destroy device if this happens */ 913 914 if (newnode != NULL) { 915 dev = ndev; 916 if (exists == 0) 917 devfs_link_dev(dev); 918 919 devfs_debug(DEVFS_DEBUG_DEBUG, 920 "parent here is: %s, node is: |%s|\n", 921 ((node->parent->node_type == Nroot) ? 922 "ROOT!" : node->parent->d_dir.d_name), 923 newnode->d_dir.d_name); 924 devfs_debug(DEVFS_DEBUG_DEBUG, 925 "test: %s\n", 926 ((struct devfs_node *)(TAILQ_LAST(DEVFS_DENODE_HEAD(node->parent), devfs_node_head)))->d_dir.d_name); 927 928 /* 929 * orig_vp is set to the original vp if we 930 * cloned. 931 */ 932 /* node->flags |= DEVFS_CLONED; */ 933 devfs_allocv(&vp, newnode); 934 orig_vp = ap->a_vp; 935 ap->a_vp = vp; 936 } 937 } 938 lockmgr(&devfs_lock, LK_RELEASE); 939 940 /* 941 * Synchronize devfs here to make sure that, if the cloned 942 * device creates other device nodes in addition to the 943 * cloned one, all of them are created by the time we return 944 * from opening the cloned one. 945 */ 946 if (ndev) 947 devfs_config(); 948 } 949 950 devfs_debug(DEVFS_DEBUG_DEBUG, 951 "devfs_spec_open() called on %s! \n", 952 dev->si_name); 953 954 /* 955 * Make this field valid before any I/O in ->d_open 956 * 957 * NOTE: Shared vnode lock probably held, but its ok as long 958 * as assignments are consistent. 959 */ 960 if (!dev->si_iosize_max) 961 /* XXX: old DFLTPHYS == 64KB dependency */ 962 dev->si_iosize_max = min(MAXPHYS,64*1024); 963 964 if (dev_dflags(dev) & D_TTY) 965 vsetflags(vp, VISTTY); 966 967 /* 968 * Open the underlying device. 969 * 970 * NOTE: If the dev open returns EALREADY it has completed the open 971 * operation and is returning a fully initialized *a->a_fpp 972 * (which it may also have replaced). This includes issuing 973 * any necessary VOP_OPEN(). 
974 * 975 * Also, the returned ap->a_fpp might not be DTYPE_VNODE and 976 * if it is might not be using the vp we supplied to it. 977 */ 978 vn_unlock(vp); 979 error = dev_dopen(dev, ap->a_mode, S_IFCHR, 980 ap->a_cred, ap->a_fpp, vp); 981 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 982 983 if (__predict_false(error == EALREADY)) { 984 if (orig_vp) 985 vput(vp); 986 return 0; 987 } 988 989 /* 990 * Clean up any cloned vp if we error out. 991 */ 992 if (__predict_false(error != 0)) { 993 if (orig_vp) { 994 vput(vp); 995 ap->a_vp = orig_vp; 996 /* orig_vp = NULL; */ 997 } 998 return error; 999 } 1000 1001 /* 1002 * This checks if the disk device is going to be opened for writing. 1003 * It will be only allowed in the cases where securelevel permits it 1004 * and it's not mounted R/W. 1005 */ 1006 if ((dev_dflags(dev) & D_DISK) && (ap->a_mode & FWRITE) && 1007 (ap->a_cred != FSCRED)) { 1008 1009 /* Very secure mode. No open for writing allowed */ 1010 if (securelevel >= 2) 1011 return EPERM; 1012 1013 /* 1014 * If it is mounted R/W, do not allow to open for writing. 1015 * In the case it's mounted read-only but securelevel 1016 * is >= 1, then do not allow opening for writing either. 1017 */ 1018 if (vfs_mountedon(vp)) { 1019 if (!(dev->si_mountpoint->mnt_flag & MNT_RDONLY)) 1020 return EBUSY; 1021 else if (securelevel >= 1) 1022 return EPERM; 1023 } 1024 } 1025 1026 /* 1027 * NOTE: vnode is still locked shared. t_stop assignment should 1028 * remain consistent so we should be ok. 1029 */ 1030 if (dev_dflags(dev) & D_TTY) { 1031 if (dev->si_tty) { 1032 struct tty *tp; 1033 tp = dev->si_tty; 1034 if (!tp->t_stop) { 1035 devfs_debug(DEVFS_DEBUG_DEBUG, 1036 "devfs: no t_stop\n"); 1037 tp->t_stop = nottystop; 1038 } 1039 } 1040 } 1041 1042 /* 1043 * NOTE: vnode is still locked shared. assignments should 1044 * remain consistent so we should be ok. However, 1045 * upgrade to exclusive if we need a VM object. 1046 */ 1047 if (vn_isdisk(vp, NULL)) { 1048 if (!dev->si_bsize_phys) 1049 dev->si_bsize_phys = DEV_BSIZE; 1050 vinitvmio(vp, IDX_TO_OFF(INT_MAX), PAGE_SIZE, -1); 1051 } 1052 1053 vop_stdopen(ap); 1054 #if 0 1055 if (node) 1056 vfs_timestamp(&node->atime); 1057 #endif 1058 /* 1059 * If we replaced the vp the vop_stdopen() call will have loaded 1060 * it into fp->f_data and vref()d the vp, giving us two refs. So 1061 * instead of just unlocking it here we have to vput() it. 1062 */ 1063 if (orig_vp) 1064 vput(vp); 1065 1066 /* Ugly pty magic, to make pty devices appear once they are opened */ 1067 if (node && (node->flags & DEVFS_PTY) == DEVFS_PTY) { 1068 if (node->flags & DEVFS_INVISIBLE) 1069 node->flags &= ~DEVFS_INVISIBLE; 1070 } 1071 1072 skip: 1073 if (ap->a_fpp) { 1074 struct file *fp = *ap->a_fpp; 1075 1076 KKASSERT(fp->f_type == DTYPE_VNODE); 1077 KKASSERT((fp->f_flag & FMASK) == (ap->a_mode & FMASK)); 1078 fp->f_ops = &devfs_dev_fileops; 1079 KKASSERT(fp->f_data == (void *)vp); 1080 } 1081 1082 return 0; 1083 } 1084 1085 static int 1086 devfs_spec_close(struct vop_close_args *ap) 1087 { 1088 struct devfs_node *node; 1089 struct proc *p = curproc; 1090 struct vnode *vp = ap->a_vp; 1091 cdev_t dev = vp->v_rdev; 1092 int error = 0; 1093 int needrelock; 1094 int opencount; 1095 1096 /* 1097 * Devices flagged D_QUICK require no special handling. 
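	 * They are closed while holding only the shared vnode lock; the
	 * slow path below upgrades to an exclusive lock for its opencount
	 * and controlling-tty checks.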
	 */
	if (dev && dev_dflags(dev) & D_QUICK) {
		opencount = vp->v_opencount;
		if (opencount <= 1)
			opencount = count_dev(dev); /* XXX NOT SMP SAFE */
		if (((vp->v_flag & VRECLAIMED) ||
		    (dev_dflags(dev) & D_TRACKCLOSE) ||
		    (opencount == 1))) {
			vn_unlock(vp);
			error = dev_dclose(dev, ap->a_fflag, S_IFCHR, ap->a_fp);
			vn_lock(vp, LK_SHARED | LK_RETRY);
		}
		goto skip;
	}

	/*
	 * We do special tests on the opencount so unfortunately we need
	 * an exclusive lock.
	 */
	vn_lock(vp, LK_UPGRADE | LK_RETRY);

	if (dev)
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "devfs_spec_close() called on %s! \n",
			    dev->si_name);
	else
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "devfs_spec_close() called, null vnode!\n");

	/*
	 * A couple of hacks for devices and tty devices. The
	 * vnode ref count cannot be used to figure out the
	 * last close, but we can use v_opencount now that
	 * revoke works properly.
	 *
	 * Detect the last close on a controlling terminal and clear
	 * the session (half-close).
	 *
	 * XXX opencount is not SMP safe. The vnode is locked but there
	 * may be multiple vnodes referencing the same device.
	 */
	if (dev) {
		/*
		 * NOTE: Try to avoid global tokens when testing opencount
		 * XXX hack, fixme. needs a struct lock and opencount in
		 * struct cdev itself.
		 */
		reference_dev(dev);
		opencount = vp->v_opencount;
		if (opencount <= 1)
			opencount = count_dev(dev); /* XXX NOT SMP SAFE */
	} else {
		opencount = 0;
	}

	if (p && vp->v_opencount <= 1 && vp == p->p_session->s_ttyvp) {
		p->p_session->s_ttyvp = NULL;
		vrele(vp);
	}

	/*
	 * Vnodes can be opened and closed multiple times. Do not really
	 * close the device unless (1) it is being closed forcibly,
	 * (2) the device wants to track closes, or (3) this is the last
	 * vnode doing its last close on the device.
	 *
	 * XXX the VXLOCK (force close) case can leave vnodes referencing
	 * a closed device. This might not occur now that our revoke is
	 * fixed.
	 */
	devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -1- \n");
	if (dev && ((vp->v_flag & VRECLAIMED) ||
		    (dev_dflags(dev) & D_TRACKCLOSE) ||
		    (opencount == 1))) {
		/*
		 * Ugly pty magic, to make pty devices disappear again once
		 * they are closed.
		 */
		node = DEVFS_NODE(ap->a_vp);
		if (node && (node->flags & DEVFS_PTY))
			node->flags |= DEVFS_INVISIBLE;

		/*
		 * Unlock around dev_dclose(), unless the vnode is
		 * undergoing a vgone/reclaim (during umount).
		 */
		needrelock = 0;
		if ((vp->v_flag & VRECLAIMED) == 0 && vn_islocked(vp)) {
			needrelock = 1;
			vn_unlock(vp);
		}

		/*
		 * WARNING! If the device destroys itself the devfs node
		 *	    can disappear here.
		 *
		 * WARNING! vn_lock() will fail if the vp is in a VRECLAIM,
		 *	    which can occur during umount.
		 */
		error = dev_dclose(dev, ap->a_fflag, S_IFCHR, ap->a_fp);
		/* node is now stale */

		if (needrelock) {
			if (vn_lock(vp, LK_EXCLUSIVE |
					LK_RETRY |
					LK_FAILRECLAIM) != 0) {
				panic("devfs_spec_close: vnode %p "
				      "unexpectedly could not be relocked",
				      vp);
			}
		}
	} else {
		error = 0;
	}
	devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -2- \n");

	/*
	 * Track the actual opens and closes on the vnode.
The last close 1216 * disassociates the rdev. If the rdev is already disassociated or 1217 * the opencount is already 0, the vnode might have been revoked 1218 * and no further opencount tracking occurs. 1219 */ 1220 if (dev) 1221 release_dev(dev); 1222 skip: 1223 if (vp->v_opencount > 0) 1224 vop_stdclose(ap); 1225 return(error); 1226 1227 } 1228 1229 1230 static int 1231 devfs_fo_close(struct file *fp) 1232 { 1233 struct vnode *vp = (struct vnode *)fp->f_data; 1234 int error; 1235 1236 fp->f_ops = &badfileops; 1237 error = vn_close(vp, fp->f_flag, fp); 1238 devfs_clear_cdevpriv(fp); 1239 1240 return (error); 1241 } 1242 1243 1244 /* 1245 * Device-optimized file table vnode read routine. 1246 * 1247 * This bypasses the VOP table and talks directly to the device. Most 1248 * filesystems just route to specfs and can make this optimization. 1249 */ 1250 static int 1251 devfs_fo_read(struct file *fp, struct uio *uio, 1252 struct ucred *cred, int flags) 1253 { 1254 struct devfs_node *node; 1255 struct vnode *vp; 1256 int ioflag; 1257 int error; 1258 cdev_t dev; 1259 1260 KASSERT(uio->uio_td == curthread, 1261 ("uio_td %p is not td %p", uio->uio_td, curthread)); 1262 1263 if (uio->uio_resid == 0) 1264 return 0; 1265 1266 vp = (struct vnode *)fp->f_data; 1267 if (vp == NULL || vp->v_type == VBAD) 1268 return EBADF; 1269 1270 node = DEVFS_NODE(vp); 1271 1272 if ((dev = vp->v_rdev) == NULL) 1273 return EBADF; 1274 1275 reference_dev(dev); 1276 1277 if ((flags & O_FOFFSET) == 0) 1278 uio->uio_offset = fp->f_offset; 1279 1280 ioflag = 0; 1281 if (flags & O_FBLOCKING) { 1282 /* ioflag &= ~IO_NDELAY; */ 1283 } else if (flags & O_FNONBLOCKING) { 1284 ioflag |= IO_NDELAY; 1285 } else if (fp->f_flag & FNONBLOCK) { 1286 ioflag |= IO_NDELAY; 1287 } 1288 if (fp->f_flag & O_DIRECT) { 1289 ioflag |= IO_DIRECT; 1290 } 1291 ioflag |= sequential_heuristic(uio, fp); 1292 1293 error = dev_dread(dev, uio, ioflag, fp); 1294 1295 release_dev(dev); 1296 if (node) 1297 vfs_timestamp(&node->atime); 1298 if ((flags & O_FOFFSET) == 0) 1299 fp->f_offset = uio->uio_offset; 1300 fp->f_nextoff = uio->uio_offset; 1301 1302 return (error); 1303 } 1304 1305 1306 static int 1307 devfs_fo_write(struct file *fp, struct uio *uio, 1308 struct ucred *cred, int flags) 1309 { 1310 struct devfs_node *node; 1311 struct vnode *vp; 1312 int ioflag; 1313 int error; 1314 cdev_t dev; 1315 1316 KASSERT(uio->uio_td == curthread, 1317 ("uio_td %p is not p %p", uio->uio_td, curthread)); 1318 1319 vp = (struct vnode *)fp->f_data; 1320 if (vp == NULL || vp->v_type == VBAD) 1321 return EBADF; 1322 1323 node = DEVFS_NODE(vp); 1324 1325 if (vp->v_type == VREG) 1326 bwillwrite(uio->uio_resid); 1327 1328 vp = (struct vnode *)fp->f_data; 1329 1330 if ((dev = vp->v_rdev) == NULL) 1331 return EBADF; 1332 1333 reference_dev(dev); 1334 1335 if ((flags & O_FOFFSET) == 0) 1336 uio->uio_offset = fp->f_offset; 1337 1338 ioflag = IO_UNIT; 1339 if (vp->v_type == VREG && 1340 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) { 1341 ioflag |= IO_APPEND; 1342 } 1343 1344 if (flags & O_FBLOCKING) { 1345 /* ioflag &= ~IO_NDELAY; */ 1346 } else if (flags & O_FNONBLOCKING) { 1347 ioflag |= IO_NDELAY; 1348 } else if (fp->f_flag & FNONBLOCK) { 1349 ioflag |= IO_NDELAY; 1350 } 1351 if (fp->f_flag & O_DIRECT) { 1352 ioflag |= IO_DIRECT; 1353 } 1354 if (flags & O_FASYNCWRITE) { 1355 /* ioflag &= ~IO_SYNC; */ 1356 } else if (flags & O_FSYNCWRITE) { 1357 ioflag |= IO_SYNC; 1358 } else if (fp->f_flag & O_FSYNC) { 1359 ioflag |= IO_SYNC; 1360 } 1361 1362 if (vp->v_mount && 
	    (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag, fp);

	release_dev(dev);
	if (node) {
		vfs_timestamp(&node->atime);
		vfs_timestamp(&node->mtime);
	}

	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;

	return (error);
}


static int
devfs_fo_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct vnode *vp;
	struct vattr vattr;
	struct vattr *vap;
	u_short mode;
	cdev_t dev;
	int error;

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return EBADF;

	error = vn_stat(vp, sb, cred);
	if (error)
		return (error);

	vap = &vattr;
	error = VOP_GETATTR(vp, vap);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare2 = 0;

	/*
	 * Copy from vattr table ... or not in case it's a cloned device
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];

	sb->st_ino = vap->va_fileid;

	mode = vap->va_mode;
	mode |= S_IFCHR;
	sb->st_mode = mode;

	if (vap->va_nlink > (nlink_t)-1)
		sb->st_nlink = (nlink_t)-1;
	else
		sb->st_nlink = vap->va_nlink;

	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = devid_from_dev(DEVFS_NODE(vp)->d_dev);
	sb->st_size = vap->va_bytes;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * A VCHR and VBLK device may track the last access and last modified
	 * time independently of the filesystem. This is particularly true
	 * because device read and write calls may bypass the filesystem.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		dev = vp->v_rdev;
		if (dev != NULL) {
			if (dev->si_lastread) {
				sb->st_atimespec.tv_sec = time_second +
							  (dev->si_lastread -
							   time_uptime);
				sb->st_atimespec.tv_nsec = 0;
			}
			if (dev->si_lastwrite) {
				sb->st_mtimespec.tv_sec = time_second +
							  (dev->si_lastwrite -
							   time_uptime);
				sb->st_mtimespec.tv_nsec = 0;
			}
		}
	}

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object. In some filesystem types, this may vary from file
	 * to file"
	 * Default to PAGE_SIZE after much discussion.
	 */

	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;

	error = caps_priv_check(cred, SYSCAP_NOVFS_GENERATION);
	if (error)
		sb->st_gen = 0;
	else
		sb->st_gen = (u_int32_t)vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;

	/*
	 * This is for ABI compatibility <= 5.7 (for ABI change made in
	 * 5.7 master).
1484 */ 1485 sb->__old_st_blksize = sb->st_blksize; 1486 1487 return (0); 1488 } 1489 1490 1491 static int 1492 devfs_fo_kqfilter(struct file *fp, struct knote *kn) 1493 { 1494 struct vnode *vp; 1495 int error; 1496 cdev_t dev; 1497 1498 vp = (struct vnode *)fp->f_data; 1499 if (vp == NULL || vp->v_type == VBAD) { 1500 error = EBADF; 1501 goto done; 1502 } 1503 if ((dev = vp->v_rdev) == NULL) { 1504 error = EBADF; 1505 goto done; 1506 } 1507 reference_dev(dev); 1508 1509 error = dev_dkqfilter(dev, kn, fp); 1510 1511 release_dev(dev); 1512 1513 done: 1514 return (error); 1515 } 1516 1517 static int 1518 devfs_fo_ioctl(struct file *fp, u_long com, caddr_t data, 1519 struct ucred *ucred, struct sysmsg *msg) 1520 { 1521 #if 0 1522 struct devfs_node *node; 1523 #endif 1524 struct vnode *vp; 1525 struct vnode *ovp; 1526 cdev_t dev; 1527 int error; 1528 struct fiodname_args *name_args; 1529 size_t namlen; 1530 const char *name; 1531 1532 vp = ((struct vnode *)fp->f_data); 1533 1534 if ((dev = vp->v_rdev) == NULL) 1535 return EBADF; /* device was revoked */ 1536 1537 reference_dev(dev); 1538 1539 #if 0 1540 node = DEVFS_NODE(vp); 1541 #endif 1542 1543 devfs_debug(DEVFS_DEBUG_DEBUG, 1544 "devfs_fo_ioctl() called! for dev %s\n", 1545 dev->si_name); 1546 1547 if (com == FIODTYPE) { 1548 *(int *)data = dev_dflags(dev) & D_TYPEMASK; 1549 error = 0; 1550 goto out; 1551 } else if (com == FIODNAME) { 1552 name_args = (struct fiodname_args *)data; 1553 name = dev->si_name; 1554 namlen = strlen(name) + 1; 1555 1556 devfs_debug(DEVFS_DEBUG_DEBUG, 1557 "ioctl, got: FIODNAME for %s\n", name); 1558 1559 if (namlen <= name_args->len) 1560 error = copyout(dev->si_name, name_args->name, namlen); 1561 else 1562 error = EINVAL; 1563 1564 devfs_debug(DEVFS_DEBUG_DEBUG, 1565 "ioctl stuff: error: %d\n", error); 1566 goto out; 1567 } 1568 1569 error = dev_dioctl(dev, com, data, fp->f_flag, ucred, msg, fp); 1570 1571 #if 0 1572 if (node) { 1573 vfs_timestamp(&node->atime); 1574 vfs_timestamp(&node->mtime); 1575 } 1576 #endif 1577 if (com == TIOCSCTTY) { 1578 devfs_debug(DEVFS_DEBUG_DEBUG, 1579 "devfs_fo_ioctl: got TIOCSCTTY on %s\n", 1580 dev->si_name); 1581 } 1582 if (error == 0 && com == TIOCSCTTY) { 1583 struct proc *p = curthread->td_proc; 1584 struct session *sess; 1585 1586 devfs_debug(DEVFS_DEBUG_DEBUG, 1587 "devfs_fo_ioctl: dealing with TIOCSCTTY on %s\n", 1588 dev->si_name); 1589 if (p == NULL) { 1590 error = ENOTTY; 1591 goto out; 1592 } 1593 sess = p->p_session; 1594 1595 /* 1596 * Do nothing if reassigning same control tty 1597 */ 1598 if (sess->s_ttyvp == vp) { 1599 error = 0; 1600 goto out; 1601 } 1602 1603 /* 1604 * Get rid of reference to old control tty 1605 */ 1606 ovp = sess->s_ttyvp; 1607 vref(vp); 1608 sess->s_ttyvp = vp; 1609 if (ovp) 1610 vrele(ovp); 1611 } 1612 1613 out: 1614 release_dev(dev); 1615 devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_fo_ioctl() finished! 
\n"); 1616 return (error); 1617 } 1618 1619 int 1620 devfs_fo_seek(struct file *fp, off_t offset, int whence, off_t *res) 1621 { 1622 /* 1623 * NOTE: vnode_fileops uses exact same code 1624 */ 1625 struct vnode *vp; 1626 struct vattr_lite lva; 1627 off_t new_offset; 1628 int error; 1629 1630 vp = (struct vnode *)fp->f_data; 1631 1632 switch (whence) { 1633 case L_INCR: 1634 spin_lock(&fp->f_spin); 1635 new_offset = fp->f_offset + offset; 1636 error = 0; 1637 break; 1638 case L_XTND: 1639 error = VOP_GETATTR_LITE(vp, &lva); 1640 spin_lock(&fp->f_spin); 1641 new_offset = offset + lva.va_size; 1642 break; 1643 case L_SET: 1644 new_offset = offset; 1645 error = 0; 1646 spin_lock(&fp->f_spin); 1647 break; 1648 default: 1649 new_offset = 0; 1650 error = EINVAL; 1651 spin_lock(&fp->f_spin); 1652 break; 1653 } 1654 1655 /* 1656 * Validate the seek position. Negative offsets are not allowed 1657 * for regular files or directories. 1658 * 1659 * Normally we would also not want to allow negative offsets for 1660 * character and block-special devices. However kvm addresses 1661 * on 64 bit architectures might appear to be negative and must 1662 * be allowed. 1663 */ 1664 if (error == 0) { 1665 if (new_offset < 0 && 1666 (vp->v_type == VREG || vp->v_type == VDIR)) { 1667 error = EINVAL; 1668 } else { 1669 fp->f_offset = new_offset; 1670 } 1671 } 1672 *res = fp->f_offset; 1673 spin_unlock(&fp->f_spin); 1674 1675 return (error); 1676 } 1677 1678 static int 1679 devfs_spec_fsync(struct vop_fsync_args *ap) 1680 { 1681 struct vnode *vp = ap->a_vp; 1682 int error; 1683 1684 if (!vn_isdisk(vp, NULL)) 1685 return (0); 1686 1687 /* 1688 * Flush all dirty buffers associated with a block device. 1689 */ 1690 error = vfsync(vp, ap->a_waitfor, 10000, NULL, NULL); 1691 return (error); 1692 } 1693 1694 static int 1695 devfs_spec_read(struct vop_read_args *ap) 1696 { 1697 struct devfs_node *node; 1698 struct vnode *vp; 1699 struct uio *uio; 1700 cdev_t dev; 1701 int error; 1702 1703 vp = ap->a_vp; 1704 dev = vp->v_rdev; 1705 uio = ap->a_uio; 1706 node = DEVFS_NODE(vp); 1707 1708 if (dev == NULL) /* device was revoked */ 1709 return (EBADF); 1710 if (uio->uio_resid == 0) 1711 return (0); 1712 1713 vn_unlock(vp); 1714 error = dev_dread(dev, uio, ap->a_ioflag, NULL); 1715 vn_lock(vp, LK_SHARED | LK_RETRY); 1716 1717 if (node) 1718 vfs_timestamp(&node->atime); 1719 1720 return (error); 1721 } 1722 1723 /* 1724 * Vnode op for write 1725 * 1726 * spec_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag, 1727 * struct ucred *a_cred) 1728 */ 1729 static int 1730 devfs_spec_write(struct vop_write_args *ap) 1731 { 1732 struct devfs_node *node; 1733 struct vnode *vp; 1734 struct uio *uio; 1735 cdev_t dev; 1736 int error; 1737 1738 vp = ap->a_vp; 1739 dev = vp->v_rdev; 1740 uio = ap->a_uio; 1741 node = DEVFS_NODE(vp); 1742 1743 KKASSERT(uio->uio_segflg != UIO_NOCOPY); 1744 1745 if (dev == NULL) /* device was revoked */ 1746 return (EBADF); 1747 1748 vn_unlock(vp); 1749 error = dev_dwrite(dev, uio, ap->a_ioflag, NULL); 1750 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1751 1752 if (node) { 1753 vfs_timestamp(&node->atime); 1754 vfs_timestamp(&node->mtime); 1755 } 1756 1757 return (error); 1758 } 1759 1760 /* 1761 * Device ioctl operation. 
1762 * 1763 * spec_ioctl(struct vnode *a_vp, int a_command, caddr_t a_data, 1764 * int a_fflag, struct ucred *a_cred, struct sysmsg *msg) 1765 */ 1766 static int 1767 devfs_spec_ioctl(struct vop_ioctl_args *ap) 1768 { 1769 struct vnode *vp = ap->a_vp; 1770 #if 0 1771 struct devfs_node *node; 1772 #endif 1773 cdev_t dev; 1774 1775 if ((dev = vp->v_rdev) == NULL) 1776 return (EBADF); /* device was revoked */ 1777 #if 0 1778 node = DEVFS_NODE(vp); 1779 1780 if (node) { 1781 vfs_timestamp(&node->atime); 1782 vfs_timestamp(&node->mtime); 1783 } 1784 #endif 1785 1786 return (dev_dioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, 1787 ap->a_cred, ap->a_sysmsg, NULL)); 1788 } 1789 1790 /* 1791 * spec_kqfilter(struct vnode *a_vp, struct knote *a_kn) 1792 */ 1793 /* ARGSUSED */ 1794 static int 1795 devfs_spec_kqfilter(struct vop_kqfilter_args *ap) 1796 { 1797 struct vnode *vp = ap->a_vp; 1798 #if 0 1799 struct devfs_node *node; 1800 #endif 1801 cdev_t dev; 1802 1803 if ((dev = vp->v_rdev) == NULL) 1804 return (EBADF); /* device was revoked (EBADF) */ 1805 #if 0 1806 node = DEVFS_NODE(vp); 1807 1808 if (node) 1809 vfs_timestamp(&node->atime); 1810 #endif 1811 1812 return (dev_dkqfilter(dev, ap->a_kn, NULL)); 1813 } 1814 1815 /* 1816 * Convert a vnode strategy call into a device strategy call. Vnode strategy 1817 * calls are not limited to device DMA limits so we have to deal with the 1818 * case. 1819 * 1820 * spec_strategy(struct vnode *a_vp, struct bio *a_bio) 1821 */ 1822 static int 1823 devfs_spec_strategy(struct vop_strategy_args *ap) 1824 { 1825 struct bio *bio = ap->a_bio; 1826 struct buf *bp = bio->bio_buf; 1827 struct buf *nbp; 1828 struct vnode *vp; 1829 struct mount *mp; 1830 int chunksize; 1831 int maxiosize; 1832 1833 if (bp->b_cmd != BUF_CMD_READ && LIST_FIRST(&bp->b_dep) != NULL) 1834 buf_start(bp); 1835 1836 /* 1837 * Collect statistics on synchronous and asynchronous read 1838 * and write counts for disks that have associated filesystems. 1839 */ 1840 vp = ap->a_vp; 1841 KKASSERT(vp->v_rdev != NULL); /* XXX */ 1842 if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) { 1843 if (bp->b_cmd == BUF_CMD_READ) { 1844 if (bp->b_flags & BIO_SYNC) 1845 mp->mnt_stat.f_syncreads++; 1846 else 1847 mp->mnt_stat.f_asyncreads++; 1848 } else { 1849 if (bp->b_flags & BIO_SYNC) 1850 mp->mnt_stat.f_syncwrites++; 1851 else 1852 mp->mnt_stat.f_asyncwrites++; 1853 } 1854 } 1855 1856 /* 1857 * Device iosize limitations only apply to read and write. Shortcut 1858 * the I/O if it fits. 1859 */ 1860 if ((maxiosize = vp->v_rdev->si_iosize_max) == 0) { 1861 devfs_debug(DEVFS_DEBUG_DEBUG, 1862 "%s: si_iosize_max not set!\n", 1863 dev_dname(vp->v_rdev)); 1864 maxiosize = MAXPHYS; 1865 } 1866 #if SPEC_CHAIN_DEBUG & 2 1867 maxiosize = 4096; 1868 #endif 1869 if (bp->b_bcount <= maxiosize || 1870 (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) { 1871 dev_dstrategy_chain(vp->v_rdev, bio); 1872 return (0); 1873 } 1874 1875 /* 1876 * Clone the buffer and set up an I/O chain to chunk up the I/O. 
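	 * Each chunk is the largest multiple of the device's physical
	 * block size (DEV_BSIZE for non-disks) that fits within
	 * si_iosize_max; devfs_spec_strategy_done() then advances the
	 * window until the original request is satisfied.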
	 */
	nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT|M_ZERO);
	initbufbio(nbp);
	buf_dep_init(nbp);
	BUF_LOCK(nbp, LK_EXCLUSIVE);
	BUF_KERNPROC(nbp);
	nbp->b_vp = vp;
	nbp->b_flags = B_PAGING | B_KVABIO | (bp->b_flags & B_BNOCLIP);
	nbp->b_cpumask = bp->b_cpumask;
	nbp->b_data = bp->b_data;
	nbp->b_bio1.bio_done = devfs_spec_strategy_done;
	nbp->b_bio1.bio_offset = bio->bio_offset;
	nbp->b_bio1.bio_caller_info1.ptr = bio;

	/*
	 * Start the first transfer
	 */
	if (vn_isdisk(vp, NULL))
		chunksize = vp->v_rdev->si_bsize_phys;
	else
		chunksize = DEV_BSIZE;
	chunksize = rounddown(maxiosize, chunksize);
#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy chained I/O chunksize=%d\n",
		    chunksize);
#endif
	nbp->b_cmd = bp->b_cmd;
	nbp->b_bcount = chunksize;
	nbp->b_bufsize = chunksize;	/* used to detect a short I/O */
	nbp->b_bio1.bio_caller_info2.index = chunksize;

#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy: chain %p offset %d/%d bcount %d\n",
		    bp, 0, bp->b_bcount, nbp->b_bcount);
#endif

	dev_dstrategy(vp->v_rdev, &nbp->b_bio1);

	if (DEVFS_NODE(vp)) {
		vfs_timestamp(&DEVFS_NODE(vp)->atime);
		vfs_timestamp(&DEVFS_NODE(vp)->mtime);
	}

	return (0);
}

/*
 * Chunked up transfer completion routine - chain transfers until done
 *
 * NOTE: MPSAFE callback.
 */
static
void
devfs_spec_strategy_done(struct bio *nbio)
{
	struct buf *nbp = nbio->bio_buf;
	struct bio *bio = nbio->bio_caller_info1.ptr;	/* original bio */
	struct buf *bp = bio->bio_buf;			/* original bp */
	int chunksize = nbio->bio_caller_info2.index;	/* chunking */
	int boffset = nbp->b_data - bp->b_data;

	if (nbp->b_flags & B_ERROR) {
		/*
		 * An error terminates the chain, propagate the error back
		 * to the original bp
		 */
		bp->b_flags |= B_ERROR;
		bp->b_error = nbp->b_error;
		bp->b_resid = bp->b_bcount - boffset +
			      (nbp->b_bcount - nbp->b_resid);
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p error %d bcount %d/%d\n",
			    bp, bp->b_error, bp->b_bcount,
			    bp->b_bcount - bp->b_resid);
#endif
	} else if (nbp->b_resid) {
		/*
		 * A short read or write terminates the chain
		 */
		bp->b_error = nbp->b_error;
		bp->b_resid = bp->b_bcount - boffset +
			      (nbp->b_bcount - nbp->b_resid);
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p short read(1) "
			    "bcount %d/%d\n",
			    bp, bp->b_bcount - bp->b_resid, bp->b_bcount);
#endif
	} else if (nbp->b_bcount != nbp->b_bufsize) {
		/*
		 * A short read or write can also occur by truncating b_bcount
		 */
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p short read(2) "
			    "bcount %d/%d\n",
			    bp, nbp->b_bcount + boffset, bp->b_bcount);
#endif
		bp->b_error = 0;
		bp->b_bcount = nbp->b_bcount + boffset;
		bp->b_resid = nbp->b_resid;
	} else if (nbp->b_bcount + boffset == bp->b_bcount) {
		/*
		 * No more data terminates the chain
		 */
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p finished bcount %d\n",
			    bp, bp->b_bcount);
#endif
		bp->b_error = 0;
		bp->b_resid = 0;
	} else {
/* 1994 * Continue the chain 1995 */ 1996 boffset += nbp->b_bcount; 1997 nbp->b_data = bp->b_data + boffset; 1998 nbp->b_bcount = bp->b_bcount - boffset; 1999 if (nbp->b_bcount > chunksize) 2000 nbp->b_bcount = chunksize; 2001 nbp->b_bio1.bio_done = devfs_spec_strategy_done; 2002 nbp->b_bio1.bio_offset = bio->bio_offset + boffset; 2003 2004 #if SPEC_CHAIN_DEBUG & 1 2005 devfs_debug(DEVFS_DEBUG_DEBUG, 2006 "spec_strategy: chain %p offset %d/%d bcount %d\n", 2007 bp, boffset, bp->b_bcount, nbp->b_bcount); 2008 #endif 2009 2010 dev_dstrategy(nbp->b_vp->v_rdev, &nbp->b_bio1); 2011 return; 2012 } 2013 2014 /* 2015 * Fall through to here on termination. biodone(bp) and 2016 * clean up and free nbp. 2017 */ 2018 biodone(bio); 2019 BUF_UNLOCK(nbp); 2020 uninitbufbio(nbp); 2021 kfree(nbp, M_DEVBUF); 2022 } 2023 2024 /* 2025 * spec_freeblks(struct vnode *a_vp, daddr_t a_addr, daddr_t a_length) 2026 */ 2027 static int 2028 devfs_spec_freeblks(struct vop_freeblks_args *ap) 2029 { 2030 struct buf *bp; 2031 2032 /* 2033 * Must be a synchronous operation 2034 */ 2035 KKASSERT(ap->a_vp->v_rdev != NULL); 2036 if ((ap->a_vp->v_rdev->si_flags & SI_CANFREE) == 0) 2037 return (0); 2038 bp = getpbuf(NULL); 2039 bp->b_cmd = BUF_CMD_FREEBLKS; 2040 bp->b_bio1.bio_flags |= BIO_SYNC; 2041 bp->b_bio1.bio_offset = ap->a_offset; 2042 bp->b_bio1.bio_done = biodone_sync; 2043 bp->b_bcount = ap->a_length; 2044 dev_dstrategy(ap->a_vp->v_rdev, &bp->b_bio1); 2045 biowait(&bp->b_bio1, "TRIM"); 2046 relpbuf(bp, NULL); 2047 2048 return (0); 2049 } 2050 2051 /* 2052 * Implement degenerate case where the block requested is the block 2053 * returned, and assume that the entire device is contiguous in regards 2054 * to the contiguous block range (runp and runb). 2055 * 2056 * spec_bmap(struct vnode *a_vp, off_t a_loffset, 2057 * off_t *a_doffsetp, int *a_runp, int *a_runb) 2058 */ 2059 static int 2060 devfs_spec_bmap(struct vop_bmap_args *ap) 2061 { 2062 if (ap->a_doffsetp != NULL) 2063 *ap->a_doffsetp = ap->a_loffset; 2064 if (ap->a_runp != NULL) 2065 *ap->a_runp = MAXBSIZE; 2066 if (ap->a_runb != NULL) { 2067 if (ap->a_loffset < MAXBSIZE) 2068 *ap->a_runb = (int)ap->a_loffset; 2069 else 2070 *ap->a_runb = MAXBSIZE; 2071 } 2072 return (0); 2073 } 2074 2075 2076 /* 2077 * Special device advisory byte-level locks. 2078 * 2079 * spec_advlock(struct vnode *a_vp, caddr_t a_id, int a_op, 2080 * struct flock *a_fl, int a_flags) 2081 */ 2082 /* ARGSUSED */ 2083 static int 2084 devfs_spec_advlock(struct vop_advlock_args *ap) 2085 { 2086 return ((ap->a_flags & F_POSIX) ? EINVAL : EOPNOTSUPP); 2087 } 2088 2089 /* 2090 * NOTE: MPSAFE callback. 2091 */ 2092 static void 2093 devfs_spec_getpages_iodone(struct bio *bio) 2094 { 2095 bio->bio_buf->b_cmd = BUF_CMD_DONE; 2096 wakeup(bio->bio_buf); 2097 } 2098 2099 /* 2100 * spec_getpages() - get pages associated with device vnode. 2101 * 2102 * Note that spec_read and spec_write do not use the buffer cache, so we 2103 * must fully implement getpages here. 2104 */ 2105 static int 2106 devfs_spec_getpages(struct vop_getpages_args *ap) 2107 { 2108 vm_offset_t kva; 2109 int error; 2110 int i, pcount, size; 2111 struct buf *bp; 2112 vm_page_t m; 2113 vm_ooffset_t offset; 2114 int toff, nextoff, nread; 2115 struct vnode *vp = ap->a_vp; 2116 int blksiz; 2117 int gotreqpage; 2118 2119 error = 0; 2120 pcount = round_page(ap->a_count) / PAGE_SIZE; 2121 2122 /* 2123 * Calculate the offset of the transfer and do sanity check. 
	 */
	offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;

	/*
	 * Round up physical size for real devices. We cannot round using
	 * v_mount's block size data because v_mount has nothing to do with
	 * the device. i.e. it's usually '/dev'. We need the physical block
	 * size for the device itself.
	 *
	 * We can't use v_rdev->si_mountpoint because it only exists when the
	 * block device is mounted. However, we can use v_rdev.
	 */
	if (vn_isdisk(vp, NULL))
		blksiz = vp->v_rdev->si_bsize_phys;
	else
		blksiz = DEV_BSIZE;

	size = roundup2(ap->a_count, blksiz);

	bp = getpbuf_kva(NULL);
	kva = (vm_offset_t)bp->b_data;

	/*
	 * Map the pages to be read into the kva.
	 */
	pmap_qenter_noinval(kva, ap->a_m, pcount);

	/* Build a minimal buffer header. */
	bp->b_cmd = BUF_CMD_READ;
	bp->b_flags |= B_KVABIO;
	bp->b_bcount = size;
	bp->b_resid = 0;
	bsetrunningbufspace(bp, size);

	bp->b_bio1.bio_offset = offset;
	bp->b_bio1.bio_done = devfs_spec_getpages_iodone;

	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += pcount;

	/* Do the input. */
	vn_strategy(ap->a_vp, &bp->b_bio1);

	crit_enter();

	/* We definitely need to be at splbio here. */
	while (bp->b_cmd != BUF_CMD_DONE)
		tsleep(bp, 0, "spread", 0);

	crit_exit();

	if (bp->b_flags & B_ERROR) {
		if (bp->b_error)
			error = bp->b_error;
		else
			error = EIO;
	}

	/*
	 * If EOF is encountered we must zero-extend the result in order
	 * to ensure that the page does not contain garbage. When no
	 * error occurs, an early EOF is indicated if b_bcount got truncated.
	 * b_resid is relative to b_bcount and should be 0, but some devices
	 * might indicate an EOF with b_resid instead of truncating b_bcount.
	 */
	nread = bp->b_bcount - bp->b_resid;
	if (nread < ap->a_count) {
		bkvasync(bp);
		bzero((caddr_t)kva + nread, ap->a_count - nread);
	}
	pmap_qremove_noinval(kva, pcount);

	gotreqpage = 0;
	for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
		nextoff = toff + PAGE_SIZE;
		m = ap->a_m[i];

		/*
		 * NOTE: vm_page_undirty/clear_dirty etc do not clear the
		 *	 pmap modified bit. pmap modified bit should have
		 *	 already been cleared.
		 */
		if (nextoff <= nread) {
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (toff < nread) {
			/*
			 * Since this is a VM request, we have to supply the
			 * unaligned offset to allow vm_page_set_valid()
			 * to zero sub-DEV_BSIZE'd portions of the page.
			 */
			vm_page_set_valid(m, 0, nread - toff);
			vm_page_clear_dirty_end_nonincl(m, 0, nread - toff);
		} else {
			m->valid = 0;
			vm_page_undirty(m);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
				if (m->valid) {
					if (m->flags & PG_REFERENCED) {
						vm_page_activate(m);
					} else {
						vm_page_deactivate(m);
					}
					vm_page_wakeup(m);
				} else {
					vm_page_free(m);
				}
			} else {
				vm_page_free(m);
			}
		} else if (m->valid) {
			gotreqpage = 1;
			/*
			 * Since this is a VM request, we need to make the
			 * entire page presentable by zeroing invalid sections.
2246 */ 2247 if (m->valid != VM_PAGE_BITS_ALL) 2248 vm_page_zero_invalid(m, FALSE); 2249 } 2250 } 2251 if (!gotreqpage) { 2252 m = ap->a_m[ap->a_reqpage]; 2253 devfs_debug(DEVFS_DEBUG_WARNING, 2254 "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n", 2255 devtoname(vp->v_rdev), error, bp, bp->b_vp); 2256 devfs_debug(DEVFS_DEBUG_WARNING, 2257 " size: %d, resid: %d, a_count: %d, valid: 0x%x\n", 2258 size, bp->b_resid, ap->a_count, m->valid); 2259 devfs_debug(DEVFS_DEBUG_WARNING, 2260 " nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n", 2261 nread, ap->a_reqpage, (u_long)m->pindex, pcount); 2262 /* 2263 * Free the buffer header back to the swap buffer pool. 2264 */ 2265 relpbuf(bp, NULL); 2266 return VM_PAGER_ERROR; 2267 } 2268 /* 2269 * Free the buffer header back to the swap buffer pool. 2270 */ 2271 relpbuf(bp, NULL); 2272 if (DEVFS_NODE(ap->a_vp)) 2273 vfs_timestamp(&DEVFS_NODE(ap->a_vp)->mtime); 2274 return VM_PAGER_OK; 2275 } 2276 2277 static __inline 2278 int 2279 sequential_heuristic(struct uio *uio, struct file *fp) 2280 { 2281 /* 2282 * Sequential heuristic - detect sequential operation 2283 */ 2284 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || 2285 uio->uio_offset == fp->f_nextoff) { 2286 /* 2287 * XXX we assume that the filesystem block size is 2288 * the default. Not true, but still gives us a pretty 2289 * good indicator of how sequential the read operations 2290 * are. 2291 */ 2292 int tmpseq = fp->f_seqcount; 2293 2294 tmpseq += howmany(uio->uio_resid, MAXBSIZE); 2295 if (tmpseq > IO_SEQMAX) 2296 tmpseq = IO_SEQMAX; 2297 fp->f_seqcount = tmpseq; 2298 return(fp->f_seqcount << IO_SEQSHIFT); 2299 } 2300 2301 /* 2302 * Not sequential, quick draw-down of seqcount 2303 */ 2304 if (fp->f_seqcount > 1) 2305 fp->f_seqcount = 1; 2306 else 2307 fp->f_seqcount = 0; 2308 return(0); 2309 } 2310