/*
 * (MPSAFE)
 *
 * Copyright (c) 2009 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/malloc.h>
#include <sys/stat.h>
#include <sys/reg.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>
#include <vm/vm_object.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/tty.h>
#include <sys/diskslice.h>
#include <sys/sysctl.h>
#include <sys/devfs.h>
#include <sys/pioctl.h>
#include <vfs/fifofs/fifo.h>

#include <machine/limits.h>

#include <sys/buf2.h>
#include <vm/vm_page2.h>

#ifndef SPEC_CHAIN_DEBUG
#define SPEC_CHAIN_DEBUG	0
#endif

MALLOC_DECLARE(M_DEVFS);
#define DEVFS_BADOP	(void *)devfs_vop_badop

static int devfs_vop_badop(struct vop_generic_args *);
static int devfs_vop_access(struct vop_access_args *);
static int devfs_vop_inactive(struct vop_inactive_args *);
static int devfs_vop_reclaim(struct vop_reclaim_args *);
static int devfs_vop_readdir(struct vop_readdir_args *);
static int devfs_vop_getattr(struct vop_getattr_args *);
static int devfs_vop_setattr(struct vop_setattr_args *);
static int devfs_vop_readlink(struct vop_readlink_args *);
static int devfs_vop_print(struct vop_print_args *);

static int devfs_vop_nresolve(struct vop_nresolve_args *);
static int devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int devfs_vop_nmkdir(struct vop_nmkdir_args *);
static int devfs_vop_nsymlink(struct vop_nsymlink_args *);
static int devfs_vop_nrmdir(struct vop_nrmdir_args *);
static int devfs_vop_nremove(struct vop_nremove_args *);

static int devfs_spec_open(struct vop_open_args *);
static int devfs_spec_close(struct vop_close_args *);
static int devfs_spec_fsync(struct vop_fsync_args *);

static int devfs_spec_read(struct vop_read_args *);
static int devfs_spec_write(struct vop_write_args *);
static int devfs_spec_ioctl(struct vop_ioctl_args *);
static int devfs_spec_kqfilter(struct vop_kqfilter_args *);
static int devfs_spec_strategy(struct vop_strategy_args *);
static void devfs_spec_strategy_done(struct bio *);
static int devfs_spec_freeblks(struct vop_freeblks_args *);
static int devfs_spec_bmap(struct vop_bmap_args *);
static int devfs_spec_advlock(struct vop_advlock_args *);
static void devfs_spec_getpages_iodone(struct bio *);
static int devfs_spec_getpages(struct vop_getpages_args *);

static int devfs_fo_close(struct file *);
static int devfs_fo_read(struct file *, struct uio *, struct ucred *, int);
static int devfs_fo_write(struct file *, struct uio *, struct ucred *, int);
static int devfs_fo_stat(struct file *, struct stat *, struct ucred *);
static int devfs_fo_kqfilter(struct file *, struct knote *);
static int devfs_fo_ioctl(struct file *, u_long, caddr_t,
		struct ucred *, struct sysmsg *);
static __inline int sequential_heuristic(struct uio *, struct file *);

extern struct lock devfs_lock;

/*
 * devfs vnode operations for regular files. All vnode ops are MPSAFE.
 */
struct vop_ops devfs_vnode_norm_vops = {
	.vop_default =		vop_defaultop,
	.vop_access =		devfs_vop_access,
	.vop_advlock =		DEVFS_BADOP,
	.vop_bmap =		DEVFS_BADOP,
	.vop_close =		vop_stdclose,
	.vop_getattr =		devfs_vop_getattr,
	.vop_inactive =		devfs_vop_inactive,
	.vop_ncreate =		DEVFS_BADOP,
	.vop_nresolve =		devfs_vop_nresolve,
	.vop_nlookupdotdot =	devfs_vop_nlookupdotdot,
	.vop_nlink =		DEVFS_BADOP,
	.vop_nmkdir =		devfs_vop_nmkdir,
	.vop_nmknod =		DEVFS_BADOP,
	.vop_nremove =		devfs_vop_nremove,
	.vop_nrename =		DEVFS_BADOP,
	.vop_nrmdir =		devfs_vop_nrmdir,
	.vop_nsymlink =		devfs_vop_nsymlink,
	.vop_open =		vop_stdopen,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		devfs_vop_print,
	.vop_read =		DEVFS_BADOP,
	.vop_readdir =		devfs_vop_readdir,
	.vop_readlink =		devfs_vop_readlink,
	.vop_reallocblks =	DEVFS_BADOP,
	.vop_reclaim =		devfs_vop_reclaim,
	.vop_setattr =		devfs_vop_setattr,
	.vop_write =		DEVFS_BADOP,
	.vop_ioctl =		DEVFS_BADOP
};

/*
 * devfs vnode operations for character devices. All vnode ops are MPSAFE.
 */
struct vop_ops devfs_vnode_dev_vops = {
	.vop_default =		vop_defaultop,
	.vop_access =		devfs_vop_access,
	.vop_advlock =		devfs_spec_advlock,
	.vop_bmap =		devfs_spec_bmap,
	.vop_close =		devfs_spec_close,
	.vop_freeblks =		devfs_spec_freeblks,
	.vop_fsync =		devfs_spec_fsync,
	.vop_getattr =		devfs_vop_getattr,
	.vop_getpages =		devfs_spec_getpages,
	.vop_inactive =		devfs_vop_inactive,
	.vop_open =		devfs_spec_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		devfs_vop_print,
	.vop_kqfilter =		devfs_spec_kqfilter,
	.vop_read =		devfs_spec_read,
	.vop_readdir =		DEVFS_BADOP,
	.vop_readlink =		DEVFS_BADOP,
	.vop_reallocblks =	DEVFS_BADOP,
	.vop_reclaim =		devfs_vop_reclaim,
	.vop_setattr =		devfs_vop_setattr,
	.vop_strategy =		devfs_spec_strategy,
	.vop_write =		devfs_spec_write,
	.vop_ioctl =		devfs_spec_ioctl
};

/*
 * devfs file pointer operations. All fileops are MPSAFE.
 */
struct vop_ops *devfs_vnode_dev_vops_p = &devfs_vnode_dev_vops;

struct fileops devfs_dev_fileops = {
	.fo_read =	devfs_fo_read,
	.fo_write =	devfs_fo_write,
	.fo_ioctl =	devfs_fo_ioctl,
	.fo_kqfilter =	devfs_fo_kqfilter,
	.fo_stat =	devfs_fo_stat,
	.fo_close =	devfs_fo_close,
	.fo_shutdown =	nofo_shutdown
};

/*
 * These two functions are possibly temporary hacks for devices (aka
 * the pty code) which want to control the node attributes themselves.
 *
 * XXX we may ultimately desire to simply remove the uid/gid/mode
 * from the node entirely.
 *
 * MPSAFE - sorta. Theoretically the overwrite can compete since they
 * are loading from the same fields.
 */
static __inline void
node_sync_dev_get(struct devfs_node *node)
{
	cdev_t dev;

	if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) {
		node->uid = dev->si_uid;
		node->gid = dev->si_gid;
		node->mode = dev->si_perms;
	}
}

static __inline void
node_sync_dev_set(struct devfs_node *node)
{
	cdev_t dev;

	if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) {
		dev->si_uid = node->uid;
		dev->si_gid = node->gid;
		dev->si_perms = node->mode;
	}
}
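
/*
 * Illustrative sketch (not compiled in): how a pty-style driver that
 * wants to own its node attributes would arm the helpers above. Only
 * SI_OVERRIDE and the si_uid/si_gid/si_perms fields are taken from
 * this file; the function and the concrete values are hypothetical.
 */
#if 0
static void
example_override_attrs(cdev_t dev)
{
	dev->si_flags |= SI_OVERRIDE;	/* node_sync_dev_get() now applies */
	dev->si_uid = 0;		/* root */
	dev->si_gid = 0;		/* wheel */
	dev->si_perms = 0600;
}
#endif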

/*
 * generic entry point for unsupported operations
 */
static int
devfs_vop_badop(struct vop_generic_args *ap)
{
	return (EIO);
}

static int
devfs_vop_access(struct vop_access_args *ap)
{
	struct devfs_node *node = DEVFS_NODE(ap->a_vp);
	int error;

	if (!devfs_node_is_accessible(node))
		return ENOENT;
	node_sync_dev_get(node);
	error = vop_helper_access(ap, node->uid, node->gid,
				  node->mode, node->flags);

	return error;
}

static int
devfs_vop_inactive(struct vop_inactive_args *ap)
{
	struct devfs_node *node = DEVFS_NODE(ap->a_vp);

	if (node == NULL || (node->flags & DEVFS_NODE_LINKED) == 0)
		vrecycle(ap->a_vp);
	return 0;
}

static int
devfs_vop_reclaim(struct vop_reclaim_args *ap)
{
	struct devfs_node *node;
	struct vnode *vp;
	int locked;

	/*
	 * Check whether the devfs lock is already held; if not, acquire it.
	 */
	if ((lockstatus(&devfs_lock, curthread)) != LK_EXCLUSIVE) {
		lockmgr(&devfs_lock, LK_EXCLUSIVE);
		locked = 1;
	} else {
		locked = 0;
	}

	/*
	 * Get rid of the devfs_node if it is no longer linked into the
	 * topology. Interlocked by devfs_lock. However, be careful
	 * interposing other operations between cleaning out v_data and
	 * devfs_freep() as the node is only protected by devfs_lock
	 * once the vnode is disassociated.
	 */
	vp = ap->a_vp;
	node = DEVFS_NODE(vp);

	if (node) {
		if (node->v_node != vp) {
			kprintf("NODE->V_NODE MISMATCH VP=%p NODEVP=%p\n",
				vp, node->v_node);
		}
		vp->v_data = NULL;
		node->v_node = NULL;
		if ((node->flags & DEVFS_NODE_LINKED) == 0)
			devfs_freep(node);
	}
	v_release_rdev(vp);

	if (locked)
		lockmgr(&devfs_lock, LK_RELEASE);

	/*
	 * v_rdev needs to be properly released using v_release_rdev().
	 * Make sure v_data is NULL as well.
	 */
	return 0;
}

static int
devfs_vop_readdir(struct vop_readdir_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_vp);
	struct devfs_node *node;
	int cookie_index;
	int ncookies;
	int error2;
	int error;
	int r;
	off_t *cookies;
	off_t saveoff;

	devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_readdir() called!\n");

	if (ap->a_uio->uio_offset < 0 || ap->a_uio->uio_offset > INT_MAX)
		return (EINVAL);
	error = vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
	if (error)
		return (error);

	if (!devfs_node_is_accessible(dnode)) {
		vn_unlock(ap->a_vp);
		return ENOENT;
	}

	lockmgr(&devfs_lock, LK_EXCLUSIVE);

	saveoff = ap->a_uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = ap->a_uio->uio_resid / 16 + 1; /* Why / 16 ?? */
		if (ncookies > 256)
			ncookies = 256;
		cookies = kmalloc(256 * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	vfs_timestamp(&dnode->atime);

	if (saveoff == 0) {
		r = vop_write_dirent(&error, ap->a_uio, dnode->d_dir.d_ino,
				     DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		saveoff++;
		cookie_index++;
		if (cookie_index == ncookies)
			goto done;
	}

	if (saveoff == 1) {
		if (dnode->parent) {
			r = vop_write_dirent(&error, ap->a_uio,
					     dnode->parent->d_dir.d_ino,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, ap->a_uio,
					     dnode->d_dir.d_ino,
					     DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		saveoff++;
		cookie_index++;
		if (cookie_index == ncookies)
			goto done;
	}

	TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
		if ((node->flags & DEVFS_HIDDEN) ||
		    (node->flags & DEVFS_INVISIBLE)) {
			continue;
		}

		/*
		 * If the node type is a valid devfs alias, then we make
		 * sure that the target isn't hidden. If it is, we don't
		 * show the link in the directory listing.
		 */
		if ((node->node_type == Nlink) && (node->link_target != NULL) &&
		    (node->link_target->flags & DEVFS_HIDDEN))
			continue;

		if (node->cookie < saveoff)
			continue;

		saveoff = node->cookie;

		error2 = vop_write_dirent(&error, ap->a_uio, node->d_dir.d_ino,
					  node->d_dir.d_type,
					  node->d_dir.d_namlen,
					  node->d_dir.d_name);

		if (error2)
			break;

		saveoff++;

		if (cookies)
			cookies[cookie_index] = node->cookie;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
	}

done:
	lockmgr(&devfs_lock, LK_RELEASE);
	vn_unlock(ap->a_vp);

	ap->a_uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return (error);
}
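
/*
 * Worked example of the cookie contract implemented above: a fresh
 * directory read emits "." with cookie 0 and ".." with cookie 1, then
 * one cookie per node taken from node->cookie. When a later
 * getdirentries() resumes with uio_offset == saveoff, every node whose
 * cookie is below that offset is skipped, so the scan picks up exactly
 * where the previous buffer-full left off.
 */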

static int
devfs_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
	struct devfs_node *node, *found = NULL;
	struct namecache *ncp;
	struct vnode *vp = NULL;
	int error = 0;
	int len;
	int depth;

	ncp = ap->a_nch->ncp;
	len = ncp->nc_nlen;

	if (!devfs_node_is_accessible(dnode))
		return ENOENT;

	lockmgr(&devfs_lock, LK_EXCLUSIVE);

	if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) {
		error = ENOENT;
		cache_setvp(ap->a_nch, NULL);
		goto out;
	}

	TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
		if (len == node->d_dir.d_namlen) {
			if (!memcmp(ncp->nc_name, node->d_dir.d_name, len)) {
				found = node;
				break;
			}
		}
	}

	if (found) {
		depth = 0;
		while ((found->node_type == Nlink) && (found->link_target)) {
			if (depth >= 8) {
				devfs_debug(DEVFS_DEBUG_SHOW,
					    "Recursive link or depth >= 8");
				break;
			}

			found = found->link_target;
			++depth;
		}

		if (!(found->flags & DEVFS_HIDDEN))
			devfs_allocv(/*ap->a_dvp->v_mount, */ &vp, found);
	}

	if (vp == NULL) {
		error = ENOENT;
		cache_setvp(ap->a_nch, NULL);
		goto out;
	}
	KKASSERT(vp);
	vn_unlock(vp);
	cache_setvp(ap->a_nch, vp);
	vrele(vp);
out:
	lockmgr(&devfs_lock, LK_RELEASE);

	return error;
}

static int
devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);

	*ap->a_vpp = NULL;
	if (!devfs_node_is_accessible(dnode))
		return ENOENT;

	lockmgr(&devfs_lock, LK_EXCLUSIVE);
	if (dnode->parent != NULL) {
		devfs_allocv(ap->a_vpp, dnode->parent);
		vn_unlock(*ap->a_vpp);
	}
	lockmgr(&devfs_lock, LK_RELEASE);

	return ((*ap->a_vpp == NULL) ? ENOENT : 0);
}

/*
 * getattr() - Does not need a lock since the vp is refd
 */
static int
devfs_vop_getattr(struct vop_getattr_args *ap)
{
	struct devfs_node *node = DEVFS_NODE(ap->a_vp);
	struct vattr *vap = ap->a_vap;
	struct partinfo pinfo;
	int error = 0;

#if 0
	if (!devfs_node_is_accessible(node))
		return ENOENT;
#endif

	/*
	 * XXX This is a temporary hack to prevent crashes when the device is
	 * being destroyed (and so the underlying node will be gone) while
	 * a userland program is blocked in a read().
	 */
	if (node == NULL)
		return EIO;

	node_sync_dev_get(node);

	/* start by zeroing out the attributes */
	VATTR_NULL(vap);

	/* next do all the common fields */
	vap->va_type = ap->a_vp->v_type;
	vap->va_mode = node->mode;
	vap->va_fileid = DEVFS_NODE(ap->a_vp)->d_dir.d_ino;
	vap->va_flags = 0;
	vap->va_blocksize = DEV_BSIZE;
	vap->va_bytes = vap->va_size = 0;

	vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];

	vap->va_atime = node->atime;
	vap->va_mtime = node->mtime;
	vap->va_ctime = node->ctime;

	vap->va_nlink = 1; /* number of references to file */

	vap->va_uid = node->uid;
	vap->va_gid = node->gid;

	vap->va_rmajor = 0;
	vap->va_rminor = 0;

	if ((node->node_type == Ndev) && node->d_dev) {
		reference_dev(node->d_dev);
		vap->va_rminor = node->d_dev->si_uminor;
		release_dev(node->d_dev);
	}

	/* For a softlink the va_size is the length of the softlink */
	if (node->symlink_name != NULL) {
		vap->va_bytes = vap->va_size = node->symlink_namelen;
	}

	/*
	 * For a disk-type device, va_size is the size of the underlying
	 * device, so that lseek() works properly.
	 */
	if ((node->d_dev) && (dev_dflags(node->d_dev) & D_DISK)) {
		bzero(&pinfo, sizeof(pinfo));
		error = dev_dioctl(node->d_dev, DIOCGPART, (void *)&pinfo,
				   0, proc0.p_ucred, NULL, NULL);
		if ((error == 0) && (pinfo.media_blksize != 0)) {
			vap->va_size = pinfo.media_size;
		} else {
			vap->va_size = 0;
			error = 0;
		}
	}

	return (error);
}
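
/*
 * Example (userland view; the device name is illustrative): because
 * va_size is filled from DIOCGPART's media_size above for D_DISK
 * devices, lseek(fd, 0, SEEK_END) on a raw disk such as /dev/da0
 * returns the media size rather than 0, which is what lets tools size
 * a disk without a dedicated ioctl.
 */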

static int
devfs_vop_setattr(struct vop_setattr_args *ap)
{
	struct devfs_node *node = DEVFS_NODE(ap->a_vp);
	struct vattr *vap;
	uid_t cur_uid;
	gid_t cur_gid;
	mode_t cur_mode;
	int error = 0;

	if (!devfs_node_is_accessible(node))
		return ENOENT;
	node_sync_dev_get(node);

	vap = ap->a_vap;

	if ((vap->va_uid != (uid_t)VNOVAL) || (vap->va_gid != (gid_t)VNOVAL)) {
		cur_uid = node->uid;
		cur_gid = node->gid;
		cur_mode = node->mode;
		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
		    ap->a_cred, &cur_uid, &cur_gid, &cur_mode);
		if (error)
			goto out;

		if (node->uid != cur_uid || node->gid != cur_gid) {
			node->uid = cur_uid;
			node->gid = cur_gid;
			node->mode = cur_mode;
		}
	}

	if (vap->va_mode != (mode_t)VNOVAL) {
		cur_mode = node->mode;
		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
		    node->uid, node->gid, &cur_mode);
		if (error == 0 && node->mode != cur_mode) {
			node->mode = cur_mode;
		}
	}

out:
	node_sync_dev_set(node);
	vfs_timestamp(&node->ctime);

	return error;
}

static int
devfs_vop_readlink(struct vop_readlink_args *ap)
{
	struct devfs_node *node = DEVFS_NODE(ap->a_vp);
	int ret;

	if (!devfs_node_is_accessible(node))
		return ENOENT;

	lockmgr(&devfs_lock, LK_SHARED);
	ret = uiomove(node->symlink_name, node->symlink_namelen, ap->a_uio);
	lockmgr(&devfs_lock, LK_RELEASE);

	return ret;
}

static int
devfs_vop_print(struct vop_print_args *ap)
{
	return (0);
}

static int
devfs_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
	struct devfs_node *node;

	if (!devfs_node_is_accessible(dnode))
		return ENOENT;

	if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
		goto out;

	lockmgr(&devfs_lock, LK_EXCLUSIVE);
	devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Ndir,
		      ap->a_nch->ncp->nc_name, dnode, NULL);

	if (*ap->a_vpp) {
		node = DEVFS_NODE(*ap->a_vpp);
		node->flags |= DEVFS_USER_CREATED;
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *ap->a_vpp);
	}
	lockmgr(&devfs_lock, LK_RELEASE);
out:
	return ((*ap->a_vpp == NULL) ? ENOTDIR : 0);
}

static int
devfs_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
	struct devfs_node *node;
	size_t targetlen;

	if (!devfs_node_is_accessible(dnode))
		return ENOENT;

	ap->a_vap->va_type = VLNK;

	if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
		goto out;

	lockmgr(&devfs_lock, LK_EXCLUSIVE);
	devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Nlink,
		      ap->a_nch->ncp->nc_name, dnode, NULL);

	targetlen = strlen(ap->a_target);
	if (*ap->a_vpp) {
		node = DEVFS_NODE(*ap->a_vpp);
		node->flags |= DEVFS_USER_CREATED;
		node->symlink_namelen = targetlen;
		node->symlink_name = kmalloc(targetlen + 1, M_DEVFS, M_WAITOK);
		memcpy(node->symlink_name, ap->a_target, targetlen);
		node->symlink_name[targetlen] = '\0';
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *ap->a_vpp);
	}
	lockmgr(&devfs_lock, LK_RELEASE);
out:
	return ((*ap->a_vpp == NULL) ? ENOTDIR : 0);
}

static int
devfs_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
	struct devfs_node *node;
	struct namecache *ncp;
	int error = ENOENT;

	ncp = ap->a_nch->ncp;

	if (!devfs_node_is_accessible(dnode))
		return ENOENT;

	lockmgr(&devfs_lock, LK_EXCLUSIVE);

	if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
		goto out;

	TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
		if (ncp->nc_nlen != node->d_dir.d_namlen)
			continue;
		if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen))
			continue;

		/*
		 * only allow removal of user created dirs
		 */
		if ((node->flags & DEVFS_USER_CREATED) == 0) {
			error = EPERM;
			goto out;
		} else if (node->node_type != Ndir) {
			error = ENOTDIR;
			goto out;
		} else if (node->nchildren > 2) {
			error = ENOTEMPTY;
			goto out;
		} else {
			if (node->v_node)
				cache_inval_vp(node->v_node, CINV_DESTROY);
			devfs_unlinkp(node);
			error = 0;
			break;
		}
	}

	cache_unlink(ap->a_nch);
out:
	lockmgr(&devfs_lock, LK_RELEASE);
	return error;
}

static int
devfs_vop_nremove(struct vop_nremove_args *ap)
{
	struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
	struct devfs_node *node;
	struct namecache *ncp;
	int error = ENOENT;

	ncp = ap->a_nch->ncp;

	if (!devfs_node_is_accessible(dnode))
		return ENOENT;

	lockmgr(&devfs_lock, LK_EXCLUSIVE);

	if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
		goto out;

	TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
		if (ncp->nc_nlen != node->d_dir.d_namlen)
			continue;
		if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen))
			continue;

		/*
		 * only allow removal of user created stuff (e.g. symlinks)
		 */
		if ((node->flags & DEVFS_USER_CREATED) == 0) {
			error = EPERM;
			goto out;
		} else if (node->node_type == Ndir) {
			error = EISDIR;
			goto out;
		} else {
			if (node->v_node)
				cache_inval_vp(node->v_node, CINV_DESTROY);
			devfs_unlinkp(node);
			error = 0;
			break;
		}
	}

	cache_unlink(ap->a_nch);
out:
	lockmgr(&devfs_lock, LK_RELEASE);
	return error;
}

static int
devfs_spec_open(struct vop_open_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vnode *orig_vp = NULL;
	struct devfs_node *node = DEVFS_NODE(vp);
	struct devfs_node *newnode;
	cdev_t dev, ndev = NULL;
	int error = 0;

	if (node) {
		if (node->d_dev == NULL)
			return ENXIO;
		if (!devfs_node_is_accessible(node))
			return ENOENT;
	}

	if ((dev = vp->v_rdev) == NULL)
		return ENXIO;

	/*
	 * Simple devices that don't care. Retain the shared lock.
	 */
	if (dev_dflags(dev) & D_QUICK) {
		vn_unlock(vp);
		error = dev_dopen(dev, ap->a_mode, S_IFCHR,
				  ap->a_cred, ap->a_fp, vp);
		vn_lock(vp, LK_SHARED | LK_RETRY);
		vop_stdopen(ap);
		goto skip;
	}

	/*
	 * Slow code
	 */
	vn_lock(vp, LK_UPGRADE | LK_RETRY);
	if (node && ap->a_fp) {
		int exists;

		devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_open: -1.1-\n");
		lockmgr(&devfs_lock, LK_SHARED);

		ndev = devfs_clone(dev, node->d_dir.d_name,
				   node->d_dir.d_namlen,
				   ap->a_mode, ap->a_cred);
		if (ndev != NULL) {
			lockmgr(&devfs_lock, LK_RELEASE);
			lockmgr(&devfs_lock, LK_EXCLUSIVE);
			newnode = devfs_create_device_node(
					DEVFS_MNTDATA(vp->v_mount)->root_node,
					ndev, &exists, NULL, NULL);
			/* XXX: possibly destroy device if this happens */

			if (newnode != NULL) {
				dev = ndev;
				if (exists == 0)
					devfs_link_dev(dev);

				devfs_debug(DEVFS_DEBUG_DEBUG,
					    "parent here is: %s, node is: |%s|\n",
					    ((node->parent->node_type == Nroot) ?
					    "ROOT!" : node->parent->d_dir.d_name),
					    newnode->d_dir.d_name);
				devfs_debug(DEVFS_DEBUG_DEBUG,
					    "test: %s\n",
					    ((struct devfs_node *)(TAILQ_LAST(DEVFS_DENODE_HEAD(node->parent), devfs_node_head)))->d_dir.d_name);

				/*
				 * orig_vp is set to the original vp if we
				 * cloned.
				 */
				/* node->flags |= DEVFS_CLONED; */
				devfs_allocv(&vp, newnode);
				orig_vp = ap->a_vp;
				ap->a_vp = vp;
			}
		}
		lockmgr(&devfs_lock, LK_RELEASE);

		/*
		 * Synchronize devfs here to make sure that, if the cloned
		 * device creates other device nodes in addition to the
		 * cloned one, all of them are created by the time we return
		 * from opening the cloned one.
		 */
		if (ndev)
			devfs_config();
	}

	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "devfs_spec_open() called on %s!\n",
		    dev->si_name);

	/*
	 * Make this field valid before any I/O in ->d_open
	 *
	 * NOTE: Shared vnode lock probably held, but its ok as long
	 *	 as assignments are consistent.
	 */
	if (!dev->si_iosize_max)
		/* XXX: old DFLTPHYS == 64KB dependency */
		dev->si_iosize_max = min(MAXPHYS, 64 * 1024);

	if (dev_dflags(dev) & D_TTY)
		vsetflags(vp, VISTTY);

	/*
	 * Open the underlying device
	 */
	vn_unlock(vp);
	error = dev_dopen(dev, ap->a_mode, S_IFCHR, ap->a_cred, ap->a_fp, vp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Clean up any cloned vp if we error out.
	 */
	if (error) {
		if (orig_vp) {
			vput(vp);
			ap->a_vp = orig_vp;
			/* orig_vp = NULL; */
		}
		return error;
	}

	/*
	 * This checks whether the disk device is being opened for writing.
	 * That is only allowed when securelevel permits it and the device
	 * is not mounted R/W.
	 */
	if ((dev_dflags(dev) & D_DISK) && (ap->a_mode & FWRITE) &&
	    (ap->a_cred != FSCRED)) {

		/* Very secure mode. No open for writing allowed */
		if (securelevel >= 2)
			return EPERM;

		/*
		 * If it is mounted R/W, do not allow to open for writing.
		 * In the case it's mounted read-only but securelevel
		 * is >= 1, then do not allow opening for writing either.
		 */
		if (vfs_mountedon(vp)) {
			if (!(dev->si_mountpoint->mnt_flag & MNT_RDONLY))
				return EBUSY;
			else if (securelevel >= 1)
				return EPERM;
		}
	}

	/*
	 * NOTE: vnode is still locked shared. t_stop assignment should
	 *	 remain consistent so we should be ok.
	 */
	if (dev_dflags(dev) & D_TTY) {
		if (dev->si_tty) {
			struct tty *tp;
			tp = dev->si_tty;
			if (!tp->t_stop) {
				devfs_debug(DEVFS_DEBUG_DEBUG,
					    "devfs: no t_stop\n");
				tp->t_stop = nottystop;
			}
		}
	}

	/*
	 * NOTE: vnode is still locked shared. assignments should
	 *	 remain consistent so we should be ok. However,
	 *	 upgrade to exclusive if we need a VM object.
	 */
	if (vn_isdisk(vp, NULL)) {
		if (!dev->si_bsize_phys)
			dev->si_bsize_phys = DEV_BSIZE;
		vinitvmio(vp, IDX_TO_OFF(INT_MAX), PAGE_SIZE, -1);
	}

	vop_stdopen(ap);
#if 0
	if (node)
		vfs_timestamp(&node->atime);
#endif
	/*
	 * If we replaced the vp the vop_stdopen() call will have loaded
	 * it into fp->f_data and vref()d the vp, giving us two refs. So
	 * instead of just unlocking it here we have to vput() it.
	 */
	if (orig_vp)
		vput(vp);

	/* Ugly pty magic, to make pty devices appear once they are opened */
	if (node && (node->flags & DEVFS_PTY) == DEVFS_PTY) {
		if (node->flags & DEVFS_INVISIBLE)
			node->flags &= ~DEVFS_INVISIBLE;
	}

skip:
	if (ap->a_fp) {
		KKASSERT(ap->a_fp->f_type == DTYPE_VNODE);
		KKASSERT((ap->a_fp->f_flag & FMASK) == (ap->a_mode & FMASK));
		ap->a_fp->f_ops = &devfs_dev_fileops;
		KKASSERT(ap->a_fp->f_data == (void *)vp);
	}

	return 0;
}
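
/*
 * Illustrative sketch (hypothetical driver, not compiled in): a simple
 * device can set D_QUICK in its dev_ops flags so that the open path
 * above takes the early shared-lock branch and skips the cloning and
 * securelevel machinery entirely. The ops layout below follows the
 * usual DragonFly convention but is an assumption, not code from this
 * file.
 */
#if 0
static struct dev_ops example_quick_ops = {
	{ "example", 0, D_QUICK },
	.d_open =	example_open,
	.d_close =	example_close,
	.d_read =	example_read,
	.d_write =	example_write,
};
#endif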
\n", 1102 dev->si_name); 1103 else 1104 devfs_debug(DEVFS_DEBUG_DEBUG, 1105 "devfs_spec_close() called, null vode!\n"); 1106 1107 /* 1108 * A couple of hacks for devices and tty devices. The 1109 * vnode ref count cannot be used to figure out the 1110 * last close, but we can use v_opencount now that 1111 * revoke works properly. 1112 * 1113 * Detect the last close on a controlling terminal and clear 1114 * the session (half-close). 1115 * 1116 * XXX opencount is not SMP safe. The vnode is locked but there 1117 * may be multiple vnodes referencing the same device. 1118 */ 1119 if (dev) { 1120 /* 1121 * NOTE: Try to avoid global tokens when testing opencount 1122 * XXX hack, fixme. needs a struct lock and opencount in 1123 * struct cdev itself. 1124 */ 1125 reference_dev(dev); 1126 opencount = vp->v_opencount; 1127 if (opencount <= 1) 1128 opencount = count_dev(dev); /* XXX NOT SMP SAFE */ 1129 } else { 1130 opencount = 0; 1131 } 1132 1133 if (p && vp->v_opencount <= 1 && vp == p->p_session->s_ttyvp) { 1134 p->p_session->s_ttyvp = NULL; 1135 vrele(vp); 1136 } 1137 1138 /* 1139 * Vnodes can be opened and closed multiple times. Do not really 1140 * close the device unless (1) it is being closed forcibly, 1141 * (2) the device wants to track closes, or (3) this is the last 1142 * vnode doing its last close on the device. 1143 * 1144 * XXX the VXLOCK (force close) case can leave vnodes referencing 1145 * a closed device. This might not occur now that our revoke is 1146 * fixed. 1147 */ 1148 devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -1- \n"); 1149 if (dev && ((vp->v_flag & VRECLAIMED) || 1150 (dev_dflags(dev) & D_TRACKCLOSE) || 1151 (opencount == 1))) { 1152 /* 1153 * Ugly pty magic, to make pty devices disappear again once 1154 * they are closed. 1155 */ 1156 node = DEVFS_NODE(ap->a_vp); 1157 if (node && (node->flags & DEVFS_PTY)) 1158 node->flags |= DEVFS_INVISIBLE; 1159 1160 /* 1161 * Unlock around dev_dclose(), unless the vnode is 1162 * undergoing a vgone/reclaim (during umount). 1163 */ 1164 needrelock = 0; 1165 if ((vp->v_flag & VRECLAIMED) == 0 && vn_islocked(vp)) { 1166 needrelock = 1; 1167 vn_unlock(vp); 1168 } 1169 1170 /* 1171 * WARNING! If the device destroys itself the devfs node 1172 * can disappear here. 1173 * 1174 * WARNING! vn_lock() will fail if the vp is in a VRECLAIM, 1175 * which can occur during umount. 1176 */ 1177 error = dev_dclose(dev, ap->a_fflag, S_IFCHR, ap->a_fp); 1178 /* node is now stale */ 1179 1180 if (needrelock) { 1181 if (vn_lock(vp, LK_EXCLUSIVE | 1182 LK_RETRY | 1183 LK_FAILRECLAIM) != 0) { 1184 panic("devfs_spec_close: vnode %p " 1185 "unexpectedly could not be relocked", 1186 vp); 1187 } 1188 } 1189 } else { 1190 error = 0; 1191 } 1192 devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -2- \n"); 1193 1194 /* 1195 * Track the actual opens and closes on the vnode. The last close 1196 * disassociates the rdev. If the rdev is already disassociated or 1197 * the opencount is already 0, the vnode might have been revoked 1198 * and no further opencount tracking occurs. 

static int
devfs_fo_close(struct file *fp)
{
	struct vnode *vp = (struct vnode *)fp->f_data;
	int error;

	fp->f_ops = &badfileops;
	error = vn_close(vp, fp->f_flag, fp);
	devfs_clear_cdevpriv(fp);

	return (error);
}

/*
 * Device-optimized file table vnode read routine.
 *
 * This bypasses the VOP table and talks directly to the device. Most
 * filesystems just route to specfs and can make this optimization.
 */
static int
devfs_fo_read(struct file *fp, struct uio *uio,
	      struct ucred *cred, int flags)
{
	struct devfs_node *node;
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));

	if (uio->uio_resid == 0)
		return 0;

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return EBADF;

	node = DEVFS_NODE(vp);

	if ((dev = vp->v_rdev) == NULL)
		return EBADF;

	reference_dev(dev);

	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dread(dev, uio, ioflag, fp);

	release_dev(dev);
	if (node)
		vfs_timestamp(&node->atime);
	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;

	return (error);
}

static int
devfs_fo_write(struct file *fp, struct uio *uio,
	       struct ucred *cred, int flags)
{
	struct devfs_node *node;
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return EBADF;

	node = DEVFS_NODE(vp);

	if (vp->v_type == VREG)
		bwillwrite(uio->uio_resid);

	if ((dev = vp->v_rdev) == NULL)
		return EBADF;

	reference_dev(dev);

	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag, fp);

	release_dev(dev);
	if (node) {
		vfs_timestamp(&node->atime);
		vfs_timestamp(&node->mtime);
	}

	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;

	return (error);
}

static int
devfs_fo_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct vnode *vp;
	struct vattr vattr;
	struct vattr *vap;
	u_short mode;
	cdev_t dev;
	int error;

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return EBADF;

	error = vn_stat(vp, sb, cred);
	if (error)
		return (error);

	vap = &vattr;
	error = VOP_GETATTR(vp, vap);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare2 = 0;

	/*
	 * Copy from vattr table ... or not in case it's a cloned device
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];

	sb->st_ino = vap->va_fileid;

	mode = vap->va_mode;
	mode |= S_IFCHR;
	sb->st_mode = mode;

	if (vap->va_nlink > (nlink_t)-1)
		sb->st_nlink = (nlink_t)-1;
	else
		sb->st_nlink = vap->va_nlink;

	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = devid_from_dev(DEVFS_NODE(vp)->d_dev);
	sb->st_size = vap->va_bytes;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * A VCHR and VBLK device may track the last access and last modified
	 * time independently of the filesystem. This is particularly true
	 * because device read and write calls may bypass the filesystem.
	 *
	 * si_lastread/si_lastwrite are recorded on the time_uptime base,
	 * so rebase them onto the wall clock here.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		dev = vp->v_rdev;
		if (dev != NULL) {
			if (dev->si_lastread) {
				sb->st_atimespec.tv_sec = time_second +
							  (dev->si_lastread -
							   time_uptime);
				sb->st_atimespec.tv_nsec = 0;
			}
			if (dev->si_lastwrite) {
				sb->st_mtimespec.tv_sec = time_second +
							  (dev->si_lastwrite -
							   time_uptime);
				sb->st_mtimespec.tv_nsec = 0;
			}
		}
	}

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 *  object. In some filesystem types, this may vary from file
	 *  to file"
	 * Default to PAGE_SIZE after much discussion.
	 */
	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;

	error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
	if (error)
		sb->st_gen = 0;
	else
		sb->st_gen = (u_int32_t)vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;

	/*
	 * This is for ABI compatibility <= 5.7 (for the ABI change made in
	 * 5.7 master).
	 */
	sb->__old_st_blksize = sb->st_blksize;

	return (0);
}

static int
devfs_fo_kqfilter(struct file *fp, struct knote *kn)
{
	struct vnode *vp;
	int error;
	cdev_t dev;

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD) {
		error = EBADF;
		goto done;
	}
	if ((dev = vp->v_rdev) == NULL) {
		error = EBADF;
		goto done;
	}
	reference_dev(dev);

	error = dev_dkqfilter(dev, kn, fp);

	release_dev(dev);

done:
	return (error);
}

static int
devfs_fo_ioctl(struct file *fp, u_long com, caddr_t data,
	       struct ucred *ucred, struct sysmsg *msg)
{
#if 0
	struct devfs_node *node;
#endif
	struct vnode *vp;
	struct vnode *ovp;
	cdev_t dev;
	int error;
	struct fiodname_args *name_args;
	size_t namlen;
	const char *name;

	vp = ((struct vnode *)fp->f_data);

	if ((dev = vp->v_rdev) == NULL)
		return EBADF;		/* device was revoked */

	reference_dev(dev);

#if 0
	node = DEVFS_NODE(vp);
#endif

	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "devfs_fo_ioctl() called! for dev %s\n",
		    dev->si_name);

	if (com == FIODTYPE) {
		*(int *)data = dev_dflags(dev) & D_TYPEMASK;
		error = 0;
		goto out;
	} else if (com == FIODNAME) {
		name_args = (struct fiodname_args *)data;
		name = dev->si_name;
		namlen = strlen(name) + 1;

		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "ioctl, got: FIODNAME for %s\n", name);

		if (namlen <= name_args->len)
			error = copyout(dev->si_name, name_args->name, namlen);
		else
			error = EINVAL;

		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "ioctl stuff: error: %d\n", error);
		goto out;
	}

	error = dev_dioctl(dev, com, data, fp->f_flag, ucred, msg, fp);

#if 0
	if (node) {
		vfs_timestamp(&node->atime);
		vfs_timestamp(&node->mtime);
	}
#endif
	if (com == TIOCSCTTY) {
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "devfs_fo_ioctl: got TIOCSCTTY on %s\n",
			    dev->si_name);
	}
	if (error == 0 && com == TIOCSCTTY) {
		struct proc *p = curthread->td_proc;
		struct session *sess;

		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "devfs_fo_ioctl: dealing with TIOCSCTTY on %s\n",
			    dev->si_name);
		if (p == NULL) {
			error = ENOTTY;
			goto out;
		}
		sess = p->p_session;

		/*
		 * Do nothing if reassigning same control tty
		 */
		if (sess->s_ttyvp == vp) {
			error = 0;
			goto out;
		}

		/*
		 * Get rid of reference to old control tty
		 */
		ovp = sess->s_ttyvp;
		vref(vp);
		sess->s_ttyvp = vp;
		if (ovp)
			vrele(ovp);
	}

out:
	release_dev(dev);
	devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_fo_ioctl() finished!\n");
	return (error);
}
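
/*
 * Userland sketch of the FIODNAME branch handled above (the buffer
 * size and error handling are illustrative assumptions; the
 * fiodname_args fields match the usage in this file):
 *
 *	struct fiodname_args fa;
 *	char buf[256];
 *
 *	fa.name = buf;
 *	fa.len = sizeof(buf);
 *	if (ioctl(fd, FIODNAME, &fa) == 0)
 *		printf("canonical device name: %s\n", buf);
 */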

static int
devfs_spec_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int error;

	if (!vn_isdisk(vp, NULL))
		return (0);

	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	error = vfsync(vp, ap->a_waitfor, 10000, NULL, NULL);
	return (error);
}

static int
devfs_spec_read(struct vop_read_args *ap)
{
	struct devfs_node *node;
	struct vnode *vp;
	struct uio *uio;
	cdev_t dev;
	int error;

	vp = ap->a_vp;
	dev = vp->v_rdev;
	uio = ap->a_uio;
	node = DEVFS_NODE(vp);

	if (dev == NULL)		/* device was revoked */
		return (EBADF);
	if (uio->uio_resid == 0)
		return (0);

	vn_unlock(vp);
	error = dev_dread(dev, uio, ap->a_ioflag, NULL);
	vn_lock(vp, LK_SHARED | LK_RETRY);

	if (node)
		vfs_timestamp(&node->atime);

	return (error);
}

/*
 * Vnode op for write
 *
 * spec_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
 *	      struct ucred *a_cred)
 */
static int
devfs_spec_write(struct vop_write_args *ap)
{
	struct devfs_node *node;
	struct vnode *vp;
	struct uio *uio;
	cdev_t dev;
	int error;

	vp = ap->a_vp;
	dev = vp->v_rdev;
	uio = ap->a_uio;
	node = DEVFS_NODE(vp);

	KKASSERT(uio->uio_segflg != UIO_NOCOPY);

	if (dev == NULL)		/* device was revoked */
		return (EBADF);

	vn_unlock(vp);
	error = dev_dwrite(dev, uio, ap->a_ioflag, NULL);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	if (node) {
		vfs_timestamp(&node->atime);
		vfs_timestamp(&node->mtime);
	}

	return (error);
}

/*
 * Device ioctl operation.
 *
 * spec_ioctl(struct vnode *a_vp, int a_command, caddr_t a_data,
 *	      int a_fflag, struct ucred *a_cred, struct sysmsg *msg)
 */
static int
devfs_spec_ioctl(struct vop_ioctl_args *ap)
{
	struct vnode *vp = ap->a_vp;
#if 0
	struct devfs_node *node;
#endif
	cdev_t dev;

	if ((dev = vp->v_rdev) == NULL)
		return (EBADF);		/* device was revoked */
#if 0
	node = DEVFS_NODE(vp);

	if (node) {
		vfs_timestamp(&node->atime);
		vfs_timestamp(&node->mtime);
	}
#endif

	return (dev_dioctl(dev, ap->a_command, ap->a_data, ap->a_fflag,
			   ap->a_cred, ap->a_sysmsg, NULL));
}

/*
 * spec_kqfilter(struct vnode *a_vp, struct knote *a_kn)
 */
/* ARGSUSED */
static int
devfs_spec_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
#if 0
	struct devfs_node *node;
#endif
	cdev_t dev;

	if ((dev = vp->v_rdev) == NULL)
		return (EBADF);		/* device was revoked (EBADF) */
#if 0
	node = DEVFS_NODE(vp);

	if (node)
		vfs_timestamp(&node->atime);
#endif

	return (dev_dkqfilter(dev, ap->a_kn, NULL));
}

/*
 * Convert a vnode strategy call into a device strategy call. Vnode strategy
 * calls are not limited to device DMA limits so we have to deal with the
 * case.
 *
 * spec_strategy(struct vnode *a_vp, struct bio *a_bio)
 */
static int
devfs_spec_strategy(struct vop_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;
	struct buf *bp = bio->bio_buf;
	struct buf *nbp;
	struct vnode *vp;
	struct mount *mp;
	int chunksize;
	int maxiosize;

	if (bp->b_cmd != BUF_CMD_READ && LIST_FIRST(&bp->b_dep) != NULL)
		buf_start(bp);

	/*
	 * Collect statistics on synchronous and asynchronous read
	 * and write counts for disks that have associated filesystems.
	 */
	vp = ap->a_vp;
	KKASSERT(vp->v_rdev != NULL);	/* XXX */
	if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
		if (bp->b_cmd == BUF_CMD_READ) {
			if (bp->b_flags & BIO_SYNC)
				mp->mnt_stat.f_syncreads++;
			else
				mp->mnt_stat.f_asyncreads++;
		} else {
			if (bp->b_flags & BIO_SYNC)
				mp->mnt_stat.f_syncwrites++;
			else
				mp->mnt_stat.f_asyncwrites++;
		}
	}

	/*
	 * Device iosize limitations only apply to read and write. Shortcut
	 * the I/O if it fits.
	 */
	if ((maxiosize = vp->v_rdev->si_iosize_max) == 0) {
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "%s: si_iosize_max not set!\n",
			    dev_dname(vp->v_rdev));
		maxiosize = MAXPHYS;
	}
#if SPEC_CHAIN_DEBUG & 2
	maxiosize = 4096;
#endif
	if (bp->b_bcount <= maxiosize ||
	    (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) {
		dev_dstrategy_chain(vp->v_rdev, bio);
		return (0);
	}

	/*
	 * Clone the buffer and set up an I/O chain to chunk up the I/O.
	 */
	nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT | M_ZERO);
	initbufbio(nbp);
	buf_dep_init(nbp);
	BUF_LOCK(nbp, LK_EXCLUSIVE);
	BUF_KERNPROC(nbp);
	nbp->b_vp = vp;
	nbp->b_flags = B_PAGING | B_KVABIO | (bp->b_flags & B_BNOCLIP);
	nbp->b_cpumask = bp->b_cpumask;
	nbp->b_data = bp->b_data;
	nbp->b_bio1.bio_done = devfs_spec_strategy_done;
	nbp->b_bio1.bio_offset = bio->bio_offset;
	nbp->b_bio1.bio_caller_info1.ptr = bio;

	/*
	 * Start the first transfer
	 */
	if (vn_isdisk(vp, NULL))
		chunksize = vp->v_rdev->si_bsize_phys;
	else
		chunksize = DEV_BSIZE;
	chunksize = rounddown(maxiosize, chunksize);
#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy chained I/O chunksize=%d\n",
		    chunksize);
#endif
	nbp->b_cmd = bp->b_cmd;
	nbp->b_bcount = chunksize;
	nbp->b_bufsize = chunksize;	/* used to detect a short I/O */
	nbp->b_bio1.bio_caller_info2.index = chunksize;

#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy: chain %p offset %d/%d bcount %d\n",
		    bp, 0, bp->b_bcount, nbp->b_bcount);
#endif

	dev_dstrategy(vp->v_rdev, &nbp->b_bio1);

	if (DEVFS_NODE(vp)) {
		vfs_timestamp(&DEVFS_NODE(vp)->atime);
		vfs_timestamp(&DEVFS_NODE(vp)->mtime);
	}

	return (0);
}

/*
 * Chunked up transfer completion routine - chain transfers until done
 *
 * NOTE: MPSAFE callback.
 */
static void
devfs_spec_strategy_done(struct bio *nbio)
{
	struct buf *nbp = nbio->bio_buf;
	struct bio *bio = nbio->bio_caller_info1.ptr;	/* original bio */
	struct buf *bp = bio->bio_buf;			/* original bp */
	int chunksize = nbio->bio_caller_info2.index;	/* chunking */
	int boffset = nbp->b_data - bp->b_data;

	if (nbp->b_flags & B_ERROR) {
		/*
		 * An error terminates the chain, propagate the error back
		 * to the original bp
		 */
		bp->b_flags |= B_ERROR;
		bp->b_error = nbp->b_error;
		bp->b_resid = bp->b_bcount - boffset +
			      (nbp->b_bcount - nbp->b_resid);
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p error %d bcount %d/%d\n",
			    bp, bp->b_error, bp->b_bcount,
			    bp->b_bcount - bp->b_resid);
#endif
	} else if (nbp->b_resid) {
		/*
		 * A short read or write terminates the chain
		 */
		bp->b_error = nbp->b_error;
		bp->b_resid = bp->b_bcount - boffset +
			      (nbp->b_bcount - nbp->b_resid);
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p short read(1) "
			    "bcount %d/%d\n",
			    bp, bp->b_bcount - bp->b_resid, bp->b_bcount);
#endif
	} else if (nbp->b_bcount != nbp->b_bufsize) {
		/*
		 * A short read or write can also occur by truncating b_bcount
		 */
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p short read(2) "
			    "bcount %d/%d\n",
			    bp, nbp->b_bcount + boffset, bp->b_bcount);
#endif
		bp->b_error = 0;
		bp->b_bcount = nbp->b_bcount + boffset;
		bp->b_resid = nbp->b_resid;
	} else if (nbp->b_bcount + boffset == bp->b_bcount) {
		/*
		 * No more data terminates the chain
		 */
#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p finished bcount %d\n",
			    bp, bp->b_bcount);
#endif
		bp->b_error = 0;
		bp->b_resid = 0;
	} else {
		/*
		 * Continue the chain
		 */
		boffset += nbp->b_bcount;
		nbp->b_data = bp->b_data + boffset;
		nbp->b_bcount = bp->b_bcount - boffset;
		if (nbp->b_bcount > chunksize)
			nbp->b_bcount = chunksize;
		nbp->b_bio1.bio_done = devfs_spec_strategy_done;
		nbp->b_bio1.bio_offset = bio->bio_offset + boffset;

#if SPEC_CHAIN_DEBUG & 1
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "spec_strategy: chain %p offset %d/%d bcount %d\n",
			    bp, boffset, bp->b_bcount, nbp->b_bcount);
#endif

		dev_dstrategy(nbp->b_vp->v_rdev, &nbp->b_bio1);
		return;
	}

	/*
	 * Fall through to here on termination. biodone(bp) and
	 * clean up and free nbp.
	 */
	biodone(bio);
	BUF_UNLOCK(nbp);
	uninitbufbio(nbp);
	kfree(nbp, M_DEVBUF);
}
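
/*
 * Worked example of the chaining above (sizes are illustrative): with
 * maxiosize, and thus chunksize, at 128KB, a 1MB transfer is carried
 * out as eight 128KB chunks. devfs_spec_strategy() dispatches the
 * first chunk; each completion then advances boffset by nbp->b_bcount,
 * rebases nbp->b_data and bio_offset, and re-dispatches, until
 * boffset + b_bcount reaches the original b_bcount, an error occurs,
 * or a short transfer terminates the chain early.
 */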

/*
 * spec_freeblks(struct vnode *a_vp, daddr_t a_addr, daddr_t a_length)
 */
static int
devfs_spec_freeblks(struct vop_freeblks_args *ap)
{
	struct buf *bp;

	/*
	 * Must be a synchronous operation
	 */
	KKASSERT(ap->a_vp->v_rdev != NULL);
	if ((ap->a_vp->v_rdev->si_flags & SI_CANFREE) == 0)
		return (0);
	bp = getpbuf(NULL);
	bp->b_cmd = BUF_CMD_FREEBLKS;
	bp->b_bio1.bio_flags |= BIO_SYNC;
	bp->b_bio1.bio_offset = ap->a_offset;
	bp->b_bio1.bio_done = biodone_sync;
	bp->b_bcount = ap->a_length;
	dev_dstrategy(ap->a_vp->v_rdev, &bp->b_bio1);
	biowait(&bp->b_bio1, "TRIM");
	relpbuf(bp, NULL);

	return (0);
}

/*
 * Implement degenerate case where the block requested is the block
 * returned, and assume that the entire device is contiguous in regards
 * to the contiguous block range (runp and runb).
 *
 * spec_bmap(struct vnode *a_vp, off_t a_loffset,
 *	     off_t *a_doffsetp, int *a_runp, int *a_runb)
 */
static int
devfs_spec_bmap(struct vop_bmap_args *ap)
{
	if (ap->a_doffsetp != NULL)
		*ap->a_doffsetp = ap->a_loffset;
	if (ap->a_runp != NULL)
		*ap->a_runp = MAXBSIZE;
	if (ap->a_runb != NULL) {
		if (ap->a_loffset < MAXBSIZE)
			*ap->a_runb = (int)ap->a_loffset;
		else
			*ap->a_runb = MAXBSIZE;
	}
	return (0);
}
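
/*
 * Example: for a raw device the mapping above is the identity, so
 * a_loffset = 65536 yields *a_doffsetp = 65536, with a full MAXBSIZE
 * forward run and a backward run clipped at the start of the device.
 */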

/*
 * Special device advisory byte-level locks.
 *
 * spec_advlock(struct vnode *a_vp, caddr_t a_id, int a_op,
 *		struct flock *a_fl, int a_flags)
 */
/* ARGSUSED */
static int
devfs_spec_advlock(struct vop_advlock_args *ap)
{
	return ((ap->a_flags & F_POSIX) ? EINVAL : EOPNOTSUPP);
}

/*
 * NOTE: MPSAFE callback.
 */
static void
devfs_spec_getpages_iodone(struct bio *bio)
{
	bio->bio_buf->b_cmd = BUF_CMD_DONE;
	wakeup(bio->bio_buf);
}

/*
 * spec_getpages() - get pages associated with device vnode.
 *
 * Note that spec_read and spec_write do not use the buffer cache, so we
 * must fully implement getpages here.
 */
static int
devfs_spec_getpages(struct vop_getpages_args *ap)
{
	vm_offset_t kva;
	int error;
	int i, pcount, size;
	struct buf *bp;
	vm_page_t m;
	vm_ooffset_t offset;
	int toff, nextoff, nread;
	struct vnode *vp = ap->a_vp;
	int blksiz;
	int gotreqpage;

	error = 0;
	pcount = round_page(ap->a_count) / PAGE_SIZE;

	/*
	 * Calculate the offset of the transfer and do sanity check.
	 */
	offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;

	/*
	 * Round up physical size for real devices. We cannot round using
	 * v_mount's block size data because v_mount has nothing to do with
	 * the device. i.e. it's usually '/dev'. We need the physical block
	 * size for the device itself.
	 *
	 * We can't use v_rdev->si_mountpoint because it only exists when the
	 * block device is mounted. However, we can use v_rdev.
	 */
	if (vn_isdisk(vp, NULL))
		blksiz = vp->v_rdev->si_bsize_phys;
	else
		blksiz = DEV_BSIZE;

	size = roundup2(ap->a_count, blksiz);

	bp = getpbuf_kva(NULL);
	kva = (vm_offset_t)bp->b_data;

	/*
	 * Map the pages to be read into the kva.
	 */
	pmap_qenter_noinval(kva, ap->a_m, pcount);

	/* Build a minimal buffer header. */
	bp->b_cmd = BUF_CMD_READ;
	bp->b_flags |= B_KVABIO;
	bp->b_bcount = size;
	bp->b_resid = 0;
	bsetrunningbufspace(bp, size);

	bp->b_bio1.bio_offset = offset;
	bp->b_bio1.bio_done = devfs_spec_getpages_iodone;

	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += pcount;

	/* Do the input. */
	vn_strategy(ap->a_vp, &bp->b_bio1);

	crit_enter();

	/* We definitely need to be at splbio here. */
	while (bp->b_cmd != BUF_CMD_DONE)
		tsleep(bp, 0, "spread", 0);

	crit_exit();

	if (bp->b_flags & B_ERROR) {
		if (bp->b_error)
			error = bp->b_error;
		else
			error = EIO;
	}

	/*
	 * If EOF is encountered we must zero-extend the result in order
	 * to ensure that the page does not contain garbage. When no
	 * error occurs, an early EOF is indicated if b_bcount got truncated.
	 * b_resid is relative to b_bcount and should be 0, but some devices
	 * might indicate an EOF with b_resid instead of truncating b_bcount.
	 */
	nread = bp->b_bcount - bp->b_resid;
	if (nread < ap->a_count) {
		bkvasync(bp);
		bzero((caddr_t)kva + nread, ap->a_count - nread);
	}
	pmap_qremove_noinval(kva, pcount);

	gotreqpage = 0;
	for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
		nextoff = toff + PAGE_SIZE;
		m = ap->a_m[i];

		/*
		 * NOTE: vm_page_undirty/clear_dirty etc do not clear the
		 *	 pmap modified bit. pmap modified bit should have
		 *	 already been cleared.
		 */
		if (nextoff <= nread) {
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (toff < nread) {
			/*
			 * Since this is a VM request, we have to supply the
			 * unaligned offset to allow vm_page_set_valid()
			 * to zero sub-DEV_BSIZE'd portions of the page.
			 */
			vm_page_set_valid(m, 0, nread - toff);
			vm_page_clear_dirty_end_nonincl(m, 0, nread - toff);
		} else {
			m->valid = 0;
			vm_page_undirty(m);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
				if (m->valid) {
					if (m->flags & PG_REFERENCED) {
						vm_page_activate(m);
					} else {
						vm_page_deactivate(m);
					}
					vm_page_wakeup(m);
				} else {
					vm_page_free(m);
				}
			} else {
				vm_page_free(m);
			}
		} else if (m->valid) {
			gotreqpage = 1;
			/*
			 * Since this is a VM request, we need to make the
			 * entire page presentable by zeroing invalid sections.
			 */
			if (m->valid != VM_PAGE_BITS_ALL)
				vm_page_zero_invalid(m, FALSE);
		}
	}
	if (!gotreqpage) {
		m = ap->a_m[ap->a_reqpage];
		devfs_debug(DEVFS_DEBUG_WARNING,
			    "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n",
			    devtoname(vp->v_rdev), error, bp, bp->b_vp);
		devfs_debug(DEVFS_DEBUG_WARNING,
			    "	size: %d, resid: %d, a_count: %d, valid: 0x%x\n",
			    size, bp->b_resid, ap->a_count, m->valid);
		devfs_debug(DEVFS_DEBUG_WARNING,
			    "	nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
			    nread, ap->a_reqpage, (u_long)m->pindex, pcount);
		/*
		 * Free the buffer header back to the swap buffer pool.
		 */
		relpbuf(bp, NULL);
		return VM_PAGER_ERROR;
	}
	/*
	 * Free the buffer header back to the swap buffer pool.
	 */
	relpbuf(bp, NULL);
	if (DEVFS_NODE(ap->a_vp))
		vfs_timestamp(&DEVFS_NODE(ap->a_vp)->mtime);
	return VM_PAGER_OK;
}

static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	/*
	 * Sequential heuristic - detect sequential operation
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * XXX we assume that the filesystem block size is
		 * the default. Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		int tmpseq = fp->f_seqcount;

		tmpseq += (uio->uio_resid + MAXBSIZE - 1) / MAXBSIZE;
		if (tmpseq > IO_SEQMAX)
			tmpseq = IO_SEQMAX;
		fp->f_seqcount = tmpseq;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
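
/*
 * Worked example of the heuristic above: a process issuing back-to-back
 * MAXBSIZE-sized reads (uio_offset always equal to f_nextoff) grows
 * f_seqcount by one step per call until it saturates at IO_SEQMAX, and
 * the hint merged into ioflag for the driver is
 * f_seqcount << IO_SEQSHIFT. The first non-sequential access drops
 * f_seqcount back toward zero, so the hint decays quickly once the
 * access pattern turns random.
 */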