xref: /netbsd-src/sys/miscfs/specfs/spec_vnops.c (revision 6deb2c22d20de1d75d538e8a5c57b573926fd157)
1 /*	$NetBSD: spec_vnops.c,v 1.126 2009/10/06 04:28:10 elad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. Neither the name of the University nor the names of its contributors
42  *    may be used to endorse or promote products derived from this software
43  *    without specific prior written permission.
44  *
45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55  * SUCH DAMAGE.
56  *
57  *	@(#)spec_vnops.c	8.15 (Berkeley) 7/14/95
58  */
59 
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.126 2009/10/06 04:28:10 elad Exp $");
62 
63 #include <sys/param.h>
64 #include <sys/proc.h>
65 #include <sys/systm.h>
66 #include <sys/kernel.h>
67 #include <sys/conf.h>
68 #include <sys/buf.h>
69 #include <sys/mount.h>
70 #include <sys/namei.h>
71 #include <sys/vnode.h>
72 #include <sys/stat.h>
73 #include <sys/errno.h>
74 #include <sys/ioctl.h>
75 #include <sys/poll.h>
76 #include <sys/file.h>
77 #include <sys/disklabel.h>
78 #include <sys/lockf.h>
79 #include <sys/tty.h>
80 #include <sys/kauth.h>
81 #include <sys/fstrans.h>
82 #include <sys/module.h>
83 
84 #include <miscfs/genfs/genfs.h>
85 #include <miscfs/specfs/specdev.h>
86 
/* symbolic sleep message strings for devices */
const char	devopn[] = "devopn";
const char	devio[] = "devio";
const char	devwait[] = "devwait";
const char	devin[] = "devin";
const char	devout[] = "devout";
const char	devioc[] = "devioc";
const char	devcls[] = "devcls";

/* Hash chains of device vnodes, indexed by SPECHASH(rdev). */
vnode_t		*specfs_hash[SPECHSZ];
97 
98 /*
99  * This vnode operations vector is used for special device nodes
100  * created from whole cloth by the kernel.  For the ops vector for
101  * vnodes built from special devices found in a filesystem, see (e.g)
102  * ffs_specop_entries[] in ffs_vnops.c or the equivalent for other
103  * filesystems.
104  */
105 
106 int (**spec_vnodeop_p)(void *);
107 const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
108 	{ &vop_default_desc, vn_default_error },
109 	{ &vop_lookup_desc, spec_lookup },		/* lookup */
110 	{ &vop_create_desc, spec_create },		/* create */
111 	{ &vop_mknod_desc, spec_mknod },		/* mknod */
112 	{ &vop_open_desc, spec_open },			/* open */
113 	{ &vop_close_desc, spec_close },		/* close */
114 	{ &vop_access_desc, spec_access },		/* access */
115 	{ &vop_getattr_desc, spec_getattr },		/* getattr */
116 	{ &vop_setattr_desc, spec_setattr },		/* setattr */
117 	{ &vop_read_desc, spec_read },			/* read */
118 	{ &vop_write_desc, spec_write },		/* write */
119 	{ &vop_fcntl_desc, spec_fcntl },		/* fcntl */
120 	{ &vop_ioctl_desc, spec_ioctl },		/* ioctl */
121 	{ &vop_poll_desc, spec_poll },			/* poll */
122 	{ &vop_kqfilter_desc, spec_kqfilter },		/* kqfilter */
123 	{ &vop_revoke_desc, spec_revoke },		/* revoke */
124 	{ &vop_mmap_desc, spec_mmap },			/* mmap */
125 	{ &vop_fsync_desc, spec_fsync },		/* fsync */
126 	{ &vop_seek_desc, spec_seek },			/* seek */
127 	{ &vop_remove_desc, spec_remove },		/* remove */
128 	{ &vop_link_desc, spec_link },			/* link */
129 	{ &vop_rename_desc, spec_rename },		/* rename */
130 	{ &vop_mkdir_desc, spec_mkdir },		/* mkdir */
131 	{ &vop_rmdir_desc, spec_rmdir },		/* rmdir */
132 	{ &vop_symlink_desc, spec_symlink },		/* symlink */
133 	{ &vop_readdir_desc, spec_readdir },		/* readdir */
134 	{ &vop_readlink_desc, spec_readlink },		/* readlink */
135 	{ &vop_abortop_desc, spec_abortop },		/* abortop */
136 	{ &vop_inactive_desc, spec_inactive },		/* inactive */
137 	{ &vop_reclaim_desc, spec_reclaim },		/* reclaim */
138 	{ &vop_lock_desc, spec_lock },			/* lock */
139 	{ &vop_unlock_desc, spec_unlock },		/* unlock */
140 	{ &vop_bmap_desc, spec_bmap },			/* bmap */
141 	{ &vop_strategy_desc, spec_strategy },		/* strategy */
142 	{ &vop_print_desc, spec_print },		/* print */
143 	{ &vop_islocked_desc, spec_islocked },		/* islocked */
144 	{ &vop_pathconf_desc, spec_pathconf },		/* pathconf */
145 	{ &vop_advlock_desc, spec_advlock },		/* advlock */
146 	{ &vop_bwrite_desc, spec_bwrite },		/* bwrite */
147 	{ &vop_getpages_desc, spec_getpages },		/* getpages */
148 	{ &vop_putpages_desc, spec_putpages },		/* putpages */
149 	{ NULL, NULL }
150 };
151 const struct vnodeopv_desc spec_vnodeop_opv_desc =
152 	{ &spec_vnodeop_p, spec_vnodeop_entries };
153 
154 /* Returns true if vnode is /dev/mem or /dev/kmem. */
155 bool
156 iskmemvp(struct vnode *vp)
157 {
158 	return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev));
159 }
160 
161 /*
162  * Returns true if dev is /dev/mem or /dev/kmem.
163  */
164 int
165 iskmemdev(dev_t dev)
166 {
167 	/* mem_no is emitted by config(8) to generated devsw.c */
168 	extern const int mem_no;
169 
170 	/* minor 14 is /dev/io on i386 with COMPAT_10 */
171 	return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
172 }
173 
174 /*
175  * Initialize a vnode that represents a device.
176  */
177 void
178 spec_node_init(vnode_t *vp, dev_t rdev)
179 {
180 	specnode_t *sn;
181 	specdev_t *sd;
182 	vnode_t *vp2;
183 	vnode_t **vpp;
184 
185 	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
186 	KASSERT(vp->v_specnode == NULL);
187 
188 	/*
189 	 * Search the hash table for this device.  If known, add a
190 	 * reference to the device structure.  If not known, create
191 	 * a new entry to represent the device.  In all cases add
192 	 * the vnode to the hash table.
193 	 */
194 	sn = kmem_alloc(sizeof(*sn), KM_SLEEP);
195 	if (sn == NULL) {
196 		/* XXX */
197 		panic("spec_node_init: unable to allocate memory");
198 	}
199 	sd = kmem_alloc(sizeof(*sd), KM_SLEEP);
200 	if (sd == NULL) {
201 		/* XXX */
202 		panic("spec_node_init: unable to allocate memory");
203 	}
204 	mutex_enter(&device_lock);
205 	vpp = &specfs_hash[SPECHASH(rdev)];
206 	for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) {
207 		KASSERT(vp2->v_specnode != NULL);
208 		if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) {
209 			break;
210 		}
211 	}
212 	if (vp2 == NULL) {
213 		/* No existing record, create a new one. */
214 		sd->sd_rdev = rdev;
215 		sd->sd_mountpoint = NULL;
216 		sd->sd_lockf = NULL;
217 		sd->sd_refcnt = 1;
218 		sd->sd_opencnt = 0;
219 		sd->sd_bdevvp = NULL;
220 		sn->sn_dev = sd;
221 		sd = NULL;
222 	} else {
223 		/* Use the existing record. */
224 		sn->sn_dev = vp2->v_specnode->sn_dev;
225 		sn->sn_dev->sd_refcnt++;
226 	}
227 	/* Insert vnode into the hash chain. */
228 	sn->sn_opencnt = 0;
229 	sn->sn_rdev = rdev;
230 	sn->sn_gone = false;
231 	vp->v_specnode = sn;
232 	vp->v_specnext = *vpp;
233 	*vpp = vp;
234 	mutex_exit(&device_lock);
235 
236 	/* Free the record we allocated if unused. */
237 	if (sd != NULL) {
238 		kmem_free(sd, sizeof(*sd));
239 	}
240 }
241 
242 /*
243  * A vnode representing a special device is going away.  Close
244  * the device if the vnode holds it open.
245  */
246 void
247 spec_node_revoke(vnode_t *vp)
248 {
249 	specnode_t *sn;
250 	specdev_t *sd;
251 
252 	sn = vp->v_specnode;
253 	sd = sn->sn_dev;
254 
255 	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
256 	KASSERT(vp->v_specnode != NULL);
257 	KASSERT((vp->v_iflag & VI_XLOCK) != 0);
258 	KASSERT(sn->sn_gone == false);
259 
260 	mutex_enter(&device_lock);
261 	KASSERT(sn->sn_opencnt <= sd->sd_opencnt);
262 	if (sn->sn_opencnt != 0) {
263 		sd->sd_opencnt -= (sn->sn_opencnt - 1);
264 		sn->sn_opencnt = 1;
265 		sn->sn_gone = true;
266 		mutex_exit(&device_lock);
267 
268 		VOP_CLOSE(vp, FNONBLOCK, NOCRED);
269 
270 		mutex_enter(&device_lock);
271 		KASSERT(sn->sn_opencnt == 0);
272 	}
273 	mutex_exit(&device_lock);
274 }
275 
276 /*
277  * A vnode representing a special device is being recycled.
278  * Destroy the specfs component.
279  */
280 void
281 spec_node_destroy(vnode_t *vp)
282 {
283 	specnode_t *sn;
284 	specdev_t *sd;
285 	vnode_t **vpp, *vp2;
286 	int refcnt;
287 
288 	sn = vp->v_specnode;
289 	sd = sn->sn_dev;
290 
291 	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
292 	KASSERT(vp->v_specnode != NULL);
293 	KASSERT(sn->sn_opencnt == 0);
294 
295 	mutex_enter(&device_lock);
296 	/* Remove from the hash and destroy the node. */
297 	vpp = &specfs_hash[SPECHASH(vp->v_rdev)];
298 	for (vp2 = *vpp;; vp2 = vp2->v_specnext) {
299 		if (vp2 == NULL) {
300 			panic("spec_node_destroy: corrupt hash");
301 		}
302 		if (vp2 == vp) {
303 			KASSERT(vp == *vpp);
304 			*vpp = vp->v_specnext;
305 			break;
306 		}
307 		if (vp2->v_specnext == vp) {
308 			vp2->v_specnext = vp->v_specnext;
309 			break;
310 		}
311 	}
312 	sn = vp->v_specnode;
313 	vp->v_specnode = NULL;
314 	refcnt = sd->sd_refcnt--;
315 	KASSERT(refcnt > 0);
316 	mutex_exit(&device_lock);
317 
318 	/* If the device is no longer in use, destroy our record. */
319 	if (refcnt == 1) {
320 		KASSERT(sd->sd_opencnt == 0);
321 		KASSERT(sd->sd_bdevvp == NULL);
322 		kmem_free(sd, sizeof(*sd));
323 	}
324 	kmem_free(sn, sizeof(*sn));
325 }
326 
327 /*
328  * Trivial lookup routine that always fails.
329  */
330 int
331 spec_lookup(void *v)
332 {
333 	struct vop_lookup_args /* {
334 		struct vnode *a_dvp;
335 		struct vnode **a_vpp;
336 		struct componentname *a_cnp;
337 	} */ *ap = v;
338 
339 	*ap->a_vpp = NULL;
340 	return (ENOTDIR);
341 }
342 
343 /*
344  * Open a special file.
345  */
346 /* ARGSUSED */
347 int
348 spec_open(void *v)
349 {
350 	struct vop_open_args /* {
351 		struct vnode *a_vp;
352 		int  a_mode;
353 		kauth_cred_t a_cred;
354 	} */ *ap = v;
355 	struct lwp *l;
356 	struct vnode *vp;
357 	dev_t dev;
358 	int error;
359 	struct partinfo pi;
360 	enum kauth_device_req req;
361 	specnode_t *sn;
362 	specdev_t *sd;
363 
364 	u_int gen;
365 	const char *name;
366 
367 	l = curlwp;
368 	vp = ap->a_vp;
369 	dev = vp->v_rdev;
370 	sn = vp->v_specnode;
371 	sd = sn->sn_dev;
372 	name = NULL;
373 	gen = 0;
374 
375 	/*
376 	 * Don't allow open if fs is mounted -nodev.
377 	 */
378 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
379 		return (ENXIO);
380 
381 	switch (ap->a_mode & (FREAD | FWRITE)) {
382 	case FREAD | FWRITE:
383 		req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW;
384 		break;
385 	case FWRITE:
386 		req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE;
387 		break;
388 	default:
389 		req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ;
390 		break;
391 	}
392 
393 	switch (vp->v_type) {
394 	case VCHR:
395 		error = kauth_authorize_device_spec(ap->a_cred, req, vp);
396 		if (error != 0)
397 			return (error);
398 
399 		/*
400 		 * Character devices can accept opens from multiple
401 		 * vnodes.
402 		 */
403 		mutex_enter(&device_lock);
404 		if (sn->sn_gone) {
405 			mutex_exit(&device_lock);
406 			return (EBADF);
407 		}
408 		sd->sd_opencnt++;
409 		sn->sn_opencnt++;
410 		mutex_exit(&device_lock);
411 		if (cdev_type(dev) == D_TTY)
412 			vp->v_vflag |= VV_ISTTY;
413 		VOP_UNLOCK(vp, 0);
414 		do {
415 			const struct cdevsw *cdev;
416 
417 			gen = module_gen;
418 			error = cdev_open(dev, ap->a_mode, S_IFCHR, l);
419 			if (error != ENXIO)
420 				break;
421 
422 			/* Check if we already have a valid driver */
423 			mutex_enter(&device_lock);
424 			cdev = cdevsw_lookup(dev);
425 			mutex_exit(&device_lock);
426 			if (cdev != NULL)
427 				break;
428 
429 			/* Get device name from devsw_conv array */
430 			if ((name = cdevsw_getname(major(dev))) == NULL)
431 				break;
432 
433 			/* Try to autoload device module */
434 			mutex_enter(&module_lock);
435 			(void) module_autoload(name, MODULE_CLASS_DRIVER);
436 			mutex_exit(&module_lock);
437 		} while (gen != module_gen);
438 
439 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
440 		break;
441 
442 	case VBLK:
443 		error = kauth_authorize_device_spec(ap->a_cred, req, vp);
444 		if (error != 0)
445 			return (error);
446 
447 		/*
448 		 * For block devices, permit only one open.  The buffer
449 		 * cache cannot remain self-consistent with multiple
450 		 * vnodes holding a block device open.
451 		 */
452 		mutex_enter(&device_lock);
453 		if (sn->sn_gone) {
454 			mutex_exit(&device_lock);
455 			return (EBADF);
456 		}
457 		if (sd->sd_opencnt != 0) {
458 			mutex_exit(&device_lock);
459 			return EBUSY;
460 		}
461 		sn->sn_opencnt = 1;
462 		sd->sd_opencnt = 1;
463 		sd->sd_bdevvp = vp;
464 		mutex_exit(&device_lock);
465 		do {
466 			const struct bdevsw *bdev;
467 
468 			gen = module_gen;
469 			error = bdev_open(dev, ap->a_mode, S_IFBLK, l);
470 			if (error != ENXIO)
471 				break;
472 
473 			/* Check if we already have a valid driver */
474 			mutex_enter(&device_lock);
475 			bdev = bdevsw_lookup(dev);
476 			mutex_exit(&device_lock);
477 			if (bdev != NULL)
478 				break;
479 
480 			/* Get device name from devsw_conv array */
481 			if ((name = bdevsw_getname(major(dev))) == NULL)
482 				break;
483 
484 			VOP_UNLOCK(vp, 0);
485 
486                         /* Try to autoload device module */
487 			mutex_enter(&module_lock);
488 			(void) module_autoload(name, MODULE_CLASS_DRIVER);
489 			mutex_exit(&module_lock);
490 
491 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
492 		} while (gen != module_gen);
493 
494 		break;
495 
496 	case VNON:
497 	case VLNK:
498 	case VDIR:
499 	case VREG:
500 	case VBAD:
501 	case VFIFO:
502 	case VSOCK:
503 	default:
504 		return 0;
505 	}
506 
507 	mutex_enter(&device_lock);
508 	if (sn->sn_gone) {
509 		if (error == 0)
510 			error = EBADF;
511 	} else if (error != 0) {
512 		sd->sd_opencnt--;
513 		sn->sn_opencnt--;
514 		if (vp->v_type == VBLK)
515 			sd->sd_bdevvp = NULL;
516 
517 	}
518 	mutex_exit(&device_lock);
519 
520 	if (cdev_type(dev) != D_DISK || error != 0)
521 		return error;
522 
523 	if (vp->v_type == VCHR)
524 		error = cdev_ioctl(vp->v_rdev, DIOCGPART, &pi, FREAD, curlwp);
525 	else
526 		error = bdev_ioctl(vp->v_rdev, DIOCGPART, &pi, FREAD, curlwp);
527 	if (error == 0)
528 		uvm_vnp_setsize(vp,
529 		    (voff_t)pi.disklab->d_secsize * pi.part->p_size);
530 	return 0;
531 }
532 
533 /*
534  * Vnode op for read
535  */
536 /* ARGSUSED */
537 int
538 spec_read(void *v)
539 {
540 	struct vop_read_args /* {
541 		struct vnode *a_vp;
542 		struct uio *a_uio;
543 		int  a_ioflag;
544 		kauth_cred_t a_cred;
545 	} */ *ap = v;
546 	struct vnode *vp = ap->a_vp;
547 	struct uio *uio = ap->a_uio;
548  	struct lwp *l = curlwp;
549 	struct buf *bp;
550 	daddr_t bn;
551 	int bsize, bscale;
552 	struct partinfo dpart;
553 	int n, on;
554 	int error = 0;
555 
556 #ifdef DIAGNOSTIC
557 	if (uio->uio_rw != UIO_READ)
558 		panic("spec_read mode");
559 	if (&uio->uio_vmspace->vm_map != kernel_map &&
560 	    uio->uio_vmspace != curproc->p_vmspace)
561 		panic("spec_read proc");
562 #endif
563 	if (uio->uio_resid == 0)
564 		return (0);
565 
566 	switch (vp->v_type) {
567 
568 	case VCHR:
569 		VOP_UNLOCK(vp, 0);
570 		error = cdev_read(vp->v_rdev, uio, ap->a_ioflag);
571 		vn_lock(vp, LK_SHARED | LK_RETRY);
572 		return (error);
573 
574 	case VBLK:
575 		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
576 		if (uio->uio_offset < 0)
577 			return (EINVAL);
578 		bsize = BLKDEV_IOSIZE;
579 		if (bdev_ioctl(vp->v_rdev, DIOCGPART, &dpart, FREAD, l) == 0) {
580 			if (dpart.part->p_fstype == FS_BSDFFS &&
581 			    dpart.part->p_frag != 0 && dpart.part->p_fsize != 0)
582 				bsize = dpart.part->p_frag *
583 				    dpart.part->p_fsize;
584 		}
585 		bscale = bsize >> DEV_BSHIFT;
586 		do {
587 			bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
588 			on = uio->uio_offset % bsize;
589 			n = min((unsigned)(bsize - on), uio->uio_resid);
590 			error = bread(vp, bn, bsize, NOCRED, 0, &bp);
591 			n = min(n, bsize - bp->b_resid);
592 			if (error) {
593 				brelse(bp, 0);
594 				return (error);
595 			}
596 			error = uiomove((char *)bp->b_data + on, n, uio);
597 			brelse(bp, 0);
598 		} while (error == 0 && uio->uio_resid > 0 && n != 0);
599 		return (error);
600 
601 	default:
602 		panic("spec_read type");
603 	}
604 	/* NOTREACHED */
605 }
606 
607 /*
608  * Vnode op for write
609  */
610 /* ARGSUSED */
611 int
612 spec_write(void *v)
613 {
614 	struct vop_write_args /* {
615 		struct vnode *a_vp;
616 		struct uio *a_uio;
617 		int  a_ioflag;
618 		kauth_cred_t a_cred;
619 	} */ *ap = v;
620 	struct vnode *vp = ap->a_vp;
621 	struct uio *uio = ap->a_uio;
622 	struct lwp *l = curlwp;
623 	struct buf *bp;
624 	daddr_t bn;
625 	int bsize, bscale;
626 	struct partinfo dpart;
627 	int n, on;
628 	int error = 0;
629 
630 #ifdef DIAGNOSTIC
631 	if (uio->uio_rw != UIO_WRITE)
632 		panic("spec_write mode");
633 	if (&uio->uio_vmspace->vm_map != kernel_map &&
634 	    uio->uio_vmspace != curproc->p_vmspace)
635 		panic("spec_write proc");
636 #endif
637 
638 	switch (vp->v_type) {
639 
640 	case VCHR:
641 		VOP_UNLOCK(vp, 0);
642 		error = cdev_write(vp->v_rdev, uio, ap->a_ioflag);
643 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
644 		return (error);
645 
646 	case VBLK:
647 		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
648 		if (uio->uio_resid == 0)
649 			return (0);
650 		if (uio->uio_offset < 0)
651 			return (EINVAL);
652 		bsize = BLKDEV_IOSIZE;
653 		if (bdev_ioctl(vp->v_rdev, DIOCGPART, &dpart, FREAD, l) == 0) {
654 			if (dpart.part->p_fstype == FS_BSDFFS &&
655 			    dpart.part->p_frag != 0 && dpart.part->p_fsize != 0)
656 				bsize = dpart.part->p_frag *
657 				    dpart.part->p_fsize;
658 		}
659 		bscale = bsize >> DEV_BSHIFT;
660 		do {
661 			bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
662 			on = uio->uio_offset % bsize;
663 			n = min((unsigned)(bsize - on), uio->uio_resid);
664 			if (n == bsize)
665 				bp = getblk(vp, bn, bsize, 0, 0);
666 			else
667 				error = bread(vp, bn, bsize, NOCRED,
668 				    B_MODIFY, &bp);
669 			if (error) {
670 				brelse(bp, 0);
671 				return (error);
672 			}
673 			n = min(n, bsize - bp->b_resid);
674 			error = uiomove((char *)bp->b_data + on, n, uio);
675 			if (error)
676 				brelse(bp, 0);
677 			else {
678 				if (n + on == bsize)
679 					bawrite(bp);
680 				else
681 					bdwrite(bp);
682 				error = bp->b_error;
683 			}
684 		} while (error == 0 && uio->uio_resid > 0 && n != 0);
685 		return (error);
686 
687 	default:
688 		panic("spec_write type");
689 	}
690 	/* NOTREACHED */
691 }
692 
693 /*
694  * Device ioctl operation.
695  */
696 /* ARGSUSED */
697 int
698 spec_ioctl(void *v)
699 {
700 	struct vop_ioctl_args /* {
701 		struct vnode *a_vp;
702 		u_long a_command;
703 		void  *a_data;
704 		int  a_fflag;
705 		kauth_cred_t a_cred;
706 	} */ *ap = v;
707 	struct vnode *vp;
708 	dev_t dev;
709 
710 	/*
711 	 * Extract all the info we need from the vnode, taking care to
712 	 * avoid a race with VOP_REVOKE().
713 	 */
714 
715 	vp = ap->a_vp;
716 	dev = NODEV;
717 	mutex_enter(&vp->v_interlock);
718 	if ((vp->v_iflag & VI_XLOCK) == 0 && vp->v_specnode) {
719 		dev = vp->v_rdev;
720 	}
721 	mutex_exit(&vp->v_interlock);
722 	if (dev == NODEV) {
723 		return ENXIO;
724 	}
725 
726 	switch (vp->v_type) {
727 
728 	case VCHR:
729 		return cdev_ioctl(dev, ap->a_command, ap->a_data,
730 		    ap->a_fflag, curlwp);
731 
732 	case VBLK:
733 		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
734 		return bdev_ioctl(dev, ap->a_command, ap->a_data,
735 		   ap->a_fflag, curlwp);
736 
737 	default:
738 		panic("spec_ioctl");
739 		/* NOTREACHED */
740 	}
741 }
742 
743 /* ARGSUSED */
744 int
745 spec_poll(void *v)
746 {
747 	struct vop_poll_args /* {
748 		struct vnode *a_vp;
749 		int a_events;
750 	} */ *ap = v;
751 	struct vnode *vp;
752 	dev_t dev;
753 
754 	/*
755 	 * Extract all the info we need from the vnode, taking care to
756 	 * avoid a race with VOP_REVOKE().
757 	 */
758 
759 	vp = ap->a_vp;
760 	dev = NODEV;
761 	mutex_enter(&vp->v_interlock);
762 	if ((vp->v_iflag & VI_XLOCK) == 0 && vp->v_specnode) {
763 		dev = vp->v_rdev;
764 	}
765 	mutex_exit(&vp->v_interlock);
766 	if (dev == NODEV) {
767 		return POLLERR;
768 	}
769 
770 	switch (vp->v_type) {
771 
772 	case VCHR:
773 		return cdev_poll(dev, ap->a_events, curlwp);
774 
775 	default:
776 		return (genfs_poll(v));
777 	}
778 }
779 
780 /* ARGSUSED */
781 int
782 spec_kqfilter(void *v)
783 {
784 	struct vop_kqfilter_args /* {
785 		struct vnode	*a_vp;
786 		struct proc	*a_kn;
787 	} */ *ap = v;
788 	dev_t dev;
789 
790 	switch (ap->a_vp->v_type) {
791 
792 	case VCHR:
793 		dev = ap->a_vp->v_rdev;
794 		return cdev_kqfilter(dev, ap->a_kn);
795 	default:
796 		/*
797 		 * Block devices don't support kqfilter, and refuse it
798 		 * for any other files (like those vflush()ed) too.
799 		 */
800 		return (EOPNOTSUPP);
801 	}
802 }
803 
804 /*
805  * Allow mapping of only D_DISK.  This is called only for VBLK.
806  */
807 int
808 spec_mmap(void *v)
809 {
810 	struct vop_mmap_args /* {
811 		struct vnode *a_vp;
812 		vm_prot_t a_prot;
813 		kauth_cred_t a_cred;
814 	} */ *ap = v;
815 	struct vnode *vp = ap->a_vp;
816 
817 	KASSERT(vp->v_type == VBLK);
818 	if (bdev_type(vp->v_rdev) != D_DISK)
819 		return EINVAL;
820 
821 	return 0;
822 }
823 
824 /*
825  * Synch buffers associated with a block device
826  */
827 /* ARGSUSED */
828 int
829 spec_fsync(void *v)
830 {
831 	struct vop_fsync_args /* {
832 		struct vnode *a_vp;
833 		kauth_cred_t a_cred;
834 		int  a_flags;
835 		off_t offlo;
836 		off_t offhi;
837 	} */ *ap = v;
838 	struct vnode *vp = ap->a_vp;
839 	struct mount *mp;
840 	int error;
841 
842 	if (vp->v_type == VBLK) {
843 		if ((mp = vp->v_specmountpoint) != NULL) {
844 			error = VFS_FSYNC(mp, vp, ap->a_flags | FSYNC_VFS);
845 			if (error != EOPNOTSUPP)
846 				return error;
847 		}
848 		vflushbuf(vp, (ap->a_flags & FSYNC_WAIT) != 0);
849 	}
850 	return (0);
851 }
852 
853 /*
854  * Just call the device strategy routine
855  */
856 int
857 spec_strategy(void *v)
858 {
859 	struct vop_strategy_args /* {
860 		struct vnode *a_vp;
861 		struct buf *a_bp;
862 	} */ *ap = v;
863 	struct vnode *vp = ap->a_vp;
864 	struct buf *bp = ap->a_bp;
865 	int error;
866 
867 	KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
868 
869 	error = 0;
870 	bp->b_dev = vp->v_rdev;
871 
872 	if (!(bp->b_flags & B_READ))
873 		error = fscow_run(bp, false);
874 
875 	if (error) {
876 		bp->b_error = error;
877 		biodone(bp);
878 		return (error);
879 	}
880 
881 	bdev_strategy(bp);
882 
883 	return (0);
884 }
885 
/*
 * Last reference to a special vnode has been released: nothing
 * to tear down here, just drop the vnode lock taken by the caller.
 */
int
spec_inactive(void *v)
{
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_l;
	} */ *ap = v;

	VOP_UNLOCK(ap->a_vp, 0);
	return (0);
}
897 
898 /*
899  * This is a noop, simply returning what one has been given.
900  */
901 int
902 spec_bmap(void *v)
903 {
904 	struct vop_bmap_args /* {
905 		struct vnode *a_vp;
906 		daddr_t  a_bn;
907 		struct vnode **a_vpp;
908 		daddr_t *a_bnp;
909 		int *a_runp;
910 	} */ *ap = v;
911 
912 	if (ap->a_vpp != NULL)
913 		*ap->a_vpp = ap->a_vp;
914 	if (ap->a_bnp != NULL)
915 		*ap->a_bnp = ap->a_bn;
916 	if (ap->a_runp != NULL)
917 		*ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1;
918 	return (0);
919 }
920 
921 /*
922  * Device close routine
923  */
924 /* ARGSUSED */
925 int
926 spec_close(void *v)
927 {
928 	struct vop_close_args /* {
929 		struct vnode *a_vp;
930 		int  a_fflag;
931 		kauth_cred_t a_cred;
932 	} */ *ap = v;
933 	struct vnode *vp = ap->a_vp;
934 	struct session *sess;
935 	dev_t dev = vp->v_rdev;
936 	int mode, error, flags, flags1, count;
937 	specnode_t *sn;
938 	specdev_t *sd;
939 
940 	flags = vp->v_iflag;
941 	sn = vp->v_specnode;
942 	sd = sn->sn_dev;
943 
944 	switch (vp->v_type) {
945 
946 	case VCHR:
947 		/*
948 		 * Hack: a tty device that is a controlling terminal
949 		 * has a reference from the session structure.  We
950 		 * cannot easily tell that a character device is a
951 		 * controlling terminal, unless it is the closing
952 		 * process' controlling terminal.  In that case, if the
953 		 * open count is 1 release the reference from the
954 		 * session.  Also, remove the link from the tty back to
955 		 * the session and pgrp.
956 		 *
957 		 * XXX V. fishy.
958 		 */
959 		mutex_enter(proc_lock);
960 		sess = curlwp->l_proc->p_session;
961 		if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) {
962 			mutex_spin_enter(&tty_lock);
963 			sess->s_ttyvp = NULL;
964 			if (sess->s_ttyp->t_session != NULL) {
965 				sess->s_ttyp->t_pgrp = NULL;
966 				sess->s_ttyp->t_session = NULL;
967 				mutex_spin_exit(&tty_lock);
968 				/* Releases proc_lock. */
969 				proc_sessrele(sess);
970 			} else {
971 				mutex_spin_exit(&tty_lock);
972 				if (sess->s_ttyp->t_pgrp != NULL)
973 					panic("spec_close: spurious pgrp ref");
974 				mutex_exit(proc_lock);
975 			}
976 			vrele(vp);
977 		} else
978 			mutex_exit(proc_lock);
979 
980 		/*
981 		 * If the vnode is locked, then we are in the midst
982 		 * of forcably closing the device, otherwise we only
983 		 * close on last reference.
984 		 */
985 		mode = S_IFCHR;
986 		break;
987 
988 	case VBLK:
989 		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
990 		/*
991 		 * On last close of a block device (that isn't mounted)
992 		 * we must invalidate any in core blocks, so that
993 		 * we can, for instance, change floppy disks.
994 		 */
995 		error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0);
996 		if (error)
997 			return (error);
998 		/*
999 		 * We do not want to really close the device if it
1000 		 * is still in use unless we are trying to close it
1001 		 * forcibly. Since every use (buffer, vnode, swap, cmap)
1002 		 * holds a reference to the vnode, and because we mark
1003 		 * any other vnodes that alias this device, when the
1004 		 * sum of the reference counts on all the aliased
1005 		 * vnodes descends to one, we are on last close.
1006 		 */
1007 		mode = S_IFBLK;
1008 		break;
1009 
1010 	default:
1011 		panic("spec_close: not special");
1012 	}
1013 
1014 	mutex_enter(&device_lock);
1015 	sn->sn_opencnt--;
1016 	count = --sd->sd_opencnt;
1017 	if (vp->v_type == VBLK)
1018 		sd->sd_bdevvp = NULL;
1019 	mutex_exit(&device_lock);
1020 
1021 	if (count != 0)
1022 		return 0;
1023 
1024 	flags1 = ap->a_fflag;
1025 
1026 	/*
1027 	 * if VI_XLOCK is set, then we're going away soon, so make this
1028 	 * non-blocking. Also ensures that we won't wedge in vn_lock below.
1029 	 */
1030 	if (flags & VI_XLOCK)
1031 		flags1 |= FNONBLOCK;
1032 
1033 	/*
1034 	 * If we're able to block, release the vnode lock & reacquire. We
1035 	 * might end up sleeping for someone else who wants our queues. They
1036 	 * won't get them if we hold the vnode locked. Also, if VI_XLOCK is
1037 	 * set, don't release the lock as we won't be able to regain it.
1038 	 */
1039 	if (!(flags1 & FNONBLOCK))
1040 		VOP_UNLOCK(vp, 0);
1041 
1042 	if (vp->v_type == VBLK)
1043 		error = bdev_close(dev, flags1, mode, curlwp);
1044 	else
1045 		error = cdev_close(dev, flags1, mode, curlwp);
1046 
1047 	if (!(flags1 & FNONBLOCK))
1048 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1049 
1050 	return (error);
1051 }
1052 
1053 /*
1054  * Print out the contents of a special device vnode.
1055  */
1056 int
1057 spec_print(void *v)
1058 {
1059 	struct vop_print_args /* {
1060 		struct vnode *a_vp;
1061 	} */ *ap = v;
1062 
1063 	printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev),
1064 	    (unsigned long long)minor(ap->a_vp->v_rdev));
1065 	return 0;
1066 }
1067 
1068 /*
1069  * Return POSIX pathconf information applicable to special devices.
1070  */
1071 int
1072 spec_pathconf(void *v)
1073 {
1074 	struct vop_pathconf_args /* {
1075 		struct vnode *a_vp;
1076 		int a_name;
1077 		register_t *a_retval;
1078 	} */ *ap = v;
1079 
1080 	switch (ap->a_name) {
1081 	case _PC_LINK_MAX:
1082 		*ap->a_retval = LINK_MAX;
1083 		return (0);
1084 	case _PC_MAX_CANON:
1085 		*ap->a_retval = MAX_CANON;
1086 		return (0);
1087 	case _PC_MAX_INPUT:
1088 		*ap->a_retval = MAX_INPUT;
1089 		return (0);
1090 	case _PC_PIPE_BUF:
1091 		*ap->a_retval = PIPE_BUF;
1092 		return (0);
1093 	case _PC_CHOWN_RESTRICTED:
1094 		*ap->a_retval = 1;
1095 		return (0);
1096 	case _PC_VDISABLE:
1097 		*ap->a_retval = _POSIX_VDISABLE;
1098 		return (0);
1099 	case _PC_SYNC_IO:
1100 		*ap->a_retval = 1;
1101 		return (0);
1102 	default:
1103 		return (EINVAL);
1104 	}
1105 	/* NOTREACHED */
1106 }
1107 
1108 /*
1109  * Advisory record locking support.
1110  */
1111 int
1112 spec_advlock(void *v)
1113 {
1114 	struct vop_advlock_args /* {
1115 		struct vnode *a_vp;
1116 		void *a_id;
1117 		int a_op;
1118 		struct flock *a_fl;
1119 		int a_flags;
1120 	} */ *ap = v;
1121 	struct vnode *vp = ap->a_vp;
1122 
1123 	return lf_advlock(ap, &vp->v_speclockf, (off_t)0);
1124 }
1125