xref: /openbsd-src/sys/dev/vnd.c (revision 43003dfe3ad45d1698bed8a37f2b0f5b14f20d4f)
1 /*	$OpenBSD: vnd.c,v 1.95 2009/08/24 08:51:18 jasper Exp $	*/
2 /*	$NetBSD: vnd.c,v 1.26 1996/03/30 23:06:11 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1988 University of Utah.
6  * Copyright (c) 1990, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  * from: Utah $Hdr: vn.c 1.13 94/04/02$
38  *
39  *	@(#)vn.c	8.6 (Berkeley) 4/1/94
40  */
41 
42 /*
43  * Vnode disk driver.
44  *
45  * Block/character interface to a vnode.  Allows one to treat a file
46  * as a disk (e.g. build a filesystem in it, mount it, etc.).
47  *
48  * NOTE 1: This uses either the VOP_BMAP/VOP_STRATEGY interface to the
49  * vnode or simple VOP_READ/VOP_WRITE.  The former is suitable for swapping
50  * as it doesn't distort the local buffer cache.  The latter is good for
51  * building disk images as it keeps the cache consistent after the block
52  * device is closed.
53  *
54  * NOTE 2: There is a security issue involved with this driver.
55  * Once mounted all access to the contents of the "mapped" file via
56  * the special file is controlled by the permissions on the special
57  * file, the protection of the mapped file is ignored (effectively,
58  * by using root credentials in all transactions).
59  *
60  * NOTE 3: Doesn't interact with leases, should it?
61  */
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/namei.h>
66 #include <sys/proc.h>
67 #include <sys/errno.h>
68 #include <sys/buf.h>
69 #include <sys/malloc.h>
70 #include <sys/pool.h>
71 #include <sys/ioctl.h>
72 #include <sys/disklabel.h>
73 #include <sys/device.h>
74 #include <sys/disk.h>
75 #include <sys/stat.h>
76 #include <sys/mount.h>
77 #include <sys/vnode.h>
78 #include <sys/file.h>
79 #include <sys/rwlock.h>
80 #include <sys/uio.h>
81 #include <sys/conf.h>
82 
83 #include <crypto/blf.h>
84 
85 #include <miscfs/specfs/specdev.h>
86 
87 #include <dev/vndioctl.h>
88 
#ifdef VNDDEBUG
int dovndcluster = 1;		/* allow read-ahead clustering via VOP_BMAP */
int vnddebug = 0x00;		/* OR of VDB_* categories to trace */
#define	VDB_FOLLOW	0x01
#define	VDB_INIT	0x02
#define	VDB_IO		0x04
#define	DNPRINTF(f, p...)	do { if ((f) & vnddebug) printf(p); } while (0)
#else
#define	DNPRINTF(f, p...)	/* nothing */
#endif	/* VNDDEBUG */

/*
 * vndunit is a bit weird.  have to reconstitute the dev_t for
 * DISKUNIT(), but with the minor masked off.
 */
#define	vndunit(x)	DISKUNIT(makedev(major(x), minor(x) & 0x7ff))
/*
 * Minor bit 0x800 selects the "simple" flavor of the device: I/O goes
 * through VOP_READ/VOP_WRITE (buffer-cache coherent) instead of
 * VOP_BMAP/VOP_STRATEGY.  See the big comment above vndstrategy().
 */
#define	vndsimple(x)	(minor(x) & 0x800)

/* same as MAKEDISKDEV, preserving the vndsimple() property */
#define	VNDLABELDEV(dev)	\
	makedev(major(dev), DISKMINOR(vndunit(dev), RAW_PART) | \
	    (vndsimple(dev) ? 0x800 : 0))

/*
 * Child buffer handed to the underlying vnode's strategy routine,
 * plus a pointer back to the originating buffer so vndiodone() can
 * account completion against it.
 */
struct vndbuf {
	struct buf	vb_buf;		/* I/O to the underlying vnode */
	struct buf	*vb_obp;	/* original buffer from above */
};

/*
 * struct vndbuf allocator
 */
struct pool     vndbufpl;

#define	getvndbuf()	pool_get(&vndbufpl, PR_WAITOK)
#define	putvndbuf(vbp)	pool_put(&vndbufpl, vbp);

/* Per-unit state; one of these per configured vnd device. */
struct vnd_softc {
	struct device	 sc_dev;
	struct disk	 sc_dk;

	char		 sc_file[VNDNLEN];	/* file we're covering */
	int		 sc_flags;		/* flags */
	size_t		 sc_size;		/* size of vnd in sectors */
	size_t		 sc_secsize;		/* sector size in bytes */
	size_t		 sc_nsectors;		/* # of sectors per track */
	size_t		 sc_ntracks;		/* # of tracks per cylinder */
	struct vnode	*sc_vp;			/* vnode */
	struct ucred	*sc_cred;		/* credentials */
	struct buf	 sc_tab;		/* transfer queue */
	blf_ctx		*sc_keyctx;		/* key context */
	struct rwlock	 sc_rwlock;		/* serializes config/label ops */
};

/* sc_flags */
#define	VNF_ALIVE	0x0001	/* NOTE(review): not set anywhere visible here */
#define	VNF_INITED	0x0002	/* unit configured via VNDIOCSET */
#define	VNF_LABELLING	0x0100	/* held across setdisklabel/writedisklabel */
#define	VNF_WLABEL	0x0200	/* label may be written (DIOCWLABEL) */
#define	VNF_HAVELABEL	0x0400	/* disklabel has been read/spoofed */
#define	VNF_SIMPLE	0x1000	/* opened through a vndsimple() minor */
#define	VNF_READONLY	0x2000	/* backing file opened read-only */

/* open mode matching how the backing vnode was opened */
#define	VNDRW(v)	((v)->sc_flags & VNF_READONLY ? FREAD : FREAD|FWRITE)

struct vnd_softc *vnd_softc;	/* array of numvnd units, see vndattach() */
int numvnd = 0;

struct dkdriver vnddkdriver = { vndstrategy };

/* called by main() at boot time */
void	vndattach(int);

void	vndclear(struct vnd_softc *);
void	vndstart(struct vnd_softc *);
int	vndsetcred(struct vnd_softc *, struct ucred *);
void	vndiodone(struct buf *);
void	vndshutdown(void);
int	vndgetdisklabel(dev_t, struct vnd_softc *, struct disklabel *, int);
void	vndencrypt(struct vnd_softc *, caddr_t, size_t, daddr64_t, int);
size_t	vndbdevsize(struct vnode *, struct proc *);

/* interruptible exclusive lock on a unit's configuration state */
#define vndlock(sc) rw_enter(&sc->sc_rwlock, RW_WRITE|RW_INTR)
#define vndunlock(sc) rw_exit_write(&sc->sc_rwlock)
172 
/*
 * Encrypt or decrypt `size' bytes at `addr' in place, one disk block
 * (dbtob(1) bytes) at a time, using the unit's Blowfish key context.
 * `off' is the starting disk block number; each block's CBC IV is
 * derived by ECB-encrypting the block number with the same key, so
 * every block gets a distinct IV without per-block state on disk.
 * `encrypt' selects direction: non-zero encrypts, zero decrypts.
 * Assumes `size' is a multiple of the block size — TODO confirm with
 * callers (both call sites pass bp->b_bcount).
 */
void
vndencrypt(struct vnd_softc *vnd, caddr_t addr, size_t size, daddr64_t off,
    int encrypt)
{
	int i, bsize;
	u_char iv[8];

	bsize = dbtob(1);
	for (i = 0; i < size/bsize; i++) {
		/* IV = Blowfish_ECB(key, 64-bit block number) */
		bzero(iv, sizeof(iv));
		bcopy((u_char *)&off, iv, sizeof(off));
		blf_ecb_encrypt(vnd->sc_keyctx, iv, sizeof(iv));
		if (encrypt)
			blf_cbc_encrypt(vnd->sc_keyctx, iv, addr, bsize);
		else
			blf_cbc_decrypt(vnd->sc_keyctx, iv, addr, bsize);

		addr += bsize;
		off++;
	}
}
194 
195 void
196 vndattach(int num)
197 {
198 	char *mem;
199 	u_long size;
200 	int i;
201 
202 	if (num <= 0)
203 		return;
204 	size = num * sizeof(struct vnd_softc);
205 	mem = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
206 	if (mem == NULL) {
207 		printf("WARNING: no memory for vnode disks\n");
208 		return;
209 	}
210 	vnd_softc = (struct vnd_softc *)mem;
211 	for (i = 0; i < num; i++) {
212 		rw_init(&vnd_softc[i].sc_rwlock, "vndlock");
213 	}
214 	numvnd = num;
215 
216 	pool_init(&vndbufpl, sizeof(struct vndbuf), 0, 0, 0, "vndbufpl", NULL);
217 	pool_setlowat(&vndbufpl, 16);
218 	pool_sethiwat(&vndbufpl, 1024);
219 }
220 
/*
 * Open a partition of a vnd unit.  Validates the unit and partition,
 * reads the disklabel on first open of a configured unit, enforces
 * that simple and non-simple flavors are not mixed for writing, and
 * records the open in the per-disk open masks so VNDIOCCLR can refuse
 * to unconfigure a busy unit.
 */
int
vndopen(dev_t dev, int flags, int mode, struct proc *p)
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;
	int error = 0, part, pmask;

	DNPRINTF(VDB_FOLLOW, "vndopen(%x, %x, %x, %p)\n", dev, flags, mode, p);

	if (unit >= numvnd)
		return (ENXIO);
	sc = &vnd_softc[unit];

	if ((error = vndlock(sc)) != 0)
		return (error);

	/*
	 * The non-simple (VOP_BMAP/VOP_STRATEGY) flavor only works on
	 * plain regular files without encryption; reject otherwise.
	 */
	if (!vndsimple(dev) && sc->sc_vp != NULL &&
	    (sc->sc_vp->v_type != VREG || sc->sc_keyctx != NULL)) {
		error = EINVAL;
		goto bad;
	}

	if ((flags & FWRITE) && (sc->sc_flags & VNF_READONLY)) {
		error = EROFS;
		goto bad;
	}

	/* First open after configuration: pull in the disklabel. */
	if ((sc->sc_flags & VNF_INITED) &&
	    (sc->sc_flags & VNF_HAVELABEL) == 0) {
		sc->sc_flags |= VNF_HAVELABEL;
		vndgetdisklabel(dev, sc, sc->sc_dk.dk_label, 0);
	}

	part = DISKPART(dev);
	pmask = 1 << part;

	/*
	 * If any partition is open, all succeeding openings must be of the
	 * same type or read-only.
	 */
	if (sc->sc_dk.dk_openmask) {
		if (((sc->sc_flags & VNF_SIMPLE) != 0) !=
		    (vndsimple(dev) != 0) && (flags & FWRITE)) {
			error = EBUSY;
			goto bad;
		}
	} else if (vndsimple(dev))
		sc->sc_flags |= VNF_SIMPLE;
	else
		sc->sc_flags &= ~VNF_SIMPLE;

	/* Check that the partition exists. */
	if (part != RAW_PART &&
	    ((sc->sc_flags & VNF_HAVELABEL) == 0 ||
	    part >= sc->sc_dk.dk_label->d_npartitions ||
	    sc->sc_dk.dk_label->d_partitions[part].p_fstype == FS_UNUSED)) {
		error = ENXIO;
		goto bad;
	}

	/* Prevent our unit from being unconfigured while open. */
	switch (mode) {
	case S_IFCHR:
		sc->sc_dk.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		sc->sc_dk.dk_bopenmask |= pmask;
		break;
	}
	sc->sc_dk.dk_openmask =
	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;

	error = 0;
bad:
	vndunlock(sc);
	return (error);
}
299 
300 /*
301  * Load the label information on the named device
302  */
303 int
304 vndgetdisklabel(dev_t dev, struct vnd_softc *sc, struct disklabel *lp,
305     int spoofonly)
306 {
307 	bzero(lp, sizeof(struct disklabel));
308 
309 	lp->d_secsize = sc->sc_secsize;
310 	lp->d_nsectors = sc->sc_nsectors;
311 	lp->d_ntracks = sc->sc_ntracks;
312 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
313 	lp->d_ncylinders = sc->sc_size / lp->d_secpercyl;
314 
315 	strncpy(lp->d_typename, "vnd device", sizeof(lp->d_typename));
316 	lp->d_type = DTYPE_VND;
317 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
318 	DL_SETDSIZE(lp, sc->sc_size);
319 	lp->d_rpm = 3600;
320 	lp->d_interleave = 1;
321 	lp->d_flags = 0;
322 	lp->d_version = 1;
323 
324 	lp->d_magic = DISKMAGIC;
325 	lp->d_magic2 = DISKMAGIC;
326 	lp->d_checksum = dkcksum(lp);
327 
328 	/* Call the generic disklabel extraction routine */
329 	return readdisklabel(VNDLABELDEV(dev), vndstrategy, lp, spoofonly);
330 }
331 
332 int
333 vndclose(dev_t dev, int flags, int mode, struct proc *p)
334 {
335 	int unit = vndunit(dev);
336 	struct vnd_softc *sc;
337 	int error = 0, part;
338 
339 	DNPRINTF(VDB_FOLLOW, "vndclose(%x, %x, %x, %p)\n", dev, flags, mode, p);
340 
341 	if (unit >= numvnd)
342 		return (ENXIO);
343 	sc = &vnd_softc[unit];
344 
345 	if ((error = vndlock(sc)) != 0)
346 		return (error);
347 
348 	part = DISKPART(dev);
349 
350 	/* ...that much closer to allowing unconfiguration... */
351 	switch (mode) {
352 	case S_IFCHR:
353 		sc->sc_dk.dk_copenmask &= ~(1 << part);
354 		break;
355 
356 	case S_IFBLK:
357 		sc->sc_dk.dk_bopenmask &= ~(1 << part);
358 		break;
359 	}
360 	sc->sc_dk.dk_openmask =
361 	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;
362 
363 	vndunlock(sc);
364 	return (0);
365 }
366 
/*
 * Two methods are used, the traditional buffercache bypassing and the
 * newer, cache-coherent on unmount, one.
 *
 * Former method:
 * Break the request into bsize pieces and submit using VOP_BMAP/VOP_STRATEGY.
 * Note that this driver can only be used for swapping over NFS on the hp
 * since nfs_strategy on the vax cannot handle u-areas and page tables.
 *
 * Latter method:
 * Repack the buffer into an uio structure and use VOP_READ/VOP_WRITE to
 * access the underlying file.
 */
void
vndstrategy(struct buf *bp)
{
	int unit = vndunit(bp->b_dev);
	struct vnd_softc *vnd = &vnd_softc[unit];
	struct vndbuf *nbp;
	int bsize;
	off_t bn;
	caddr_t addr;
	size_t resid;
	int sz, flags, error, s;
	struct iovec aiov;
	struct uio auio;
	struct proc *p = curproc;

	DNPRINTF(VDB_FOLLOW, "vndstrategy(%p): unit %d\n", bp, unit);

	if ((vnd->sc_flags & VNF_INITED) == 0) {
		bp->b_error = ENXIO;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* Ensure that the requested block is sector aligned. */
	if (bp->b_blkno % DL_BLKSPERSEC(vnd->sc_dk.dk_label) != 0) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	bn = bp->b_blkno;
	bp->b_resid = bp->b_bcount;

	if (bn < 0) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* If we have a label, do a boundary check. */
	if (vnd->sc_flags & VNF_HAVELABEL) {
		if (bounds_check_with_label(bp, vnd->sc_dk.dk_label, 1) <= 0) {
			s = splbio();
			biodone(bp);
			splx(s);
			return;
		}

		/*
		 * bounds_check_with_label() changes bp->b_resid, reset it
		 */
		bp->b_resid = bp->b_bcount;
	}

	if (vnd->sc_flags & VNF_HAVELABEL)
		sz = howmany(bp->b_bcount, vnd->sc_dk.dk_label->d_secsize);
	else
		sz = howmany(bp->b_bcount, DEV_BSIZE);

	/* No bypassing of buffer cache?  */
	if (vndsimple(bp->b_dev)) {
		/*
		 * Simple flavor: translate the whole request into a uio
		 * and service it synchronously with VOP_READ/VOP_WRITE,
		 * applying encryption around the write and after the read.
		 * Loop until all queued requests are handled.
		 */
		for (;;) {
			int part = DISKPART(bp->b_dev);
			/* Partition start, in DEV_BSIZE blocks. */
			daddr64_t off = DL_SECTOBLK(vnd->sc_dk.dk_label,
			    DL_GETPOFFSET(&vnd->sc_dk.dk_label->d_partitions[part]));
			aiov.iov_base = bp->b_data;
			auio.uio_resid = aiov.iov_len = bp->b_bcount;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = dbtob((off_t)(bp->b_blkno + off));
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_procp = p;

			vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (bp->b_flags & B_READ) {
				auio.uio_rw = UIO_READ;
				bp->b_error = VOP_READ(vnd->sc_vp, &auio, 0,
				    vnd->sc_cred);
				/* Decrypt what we just read from the file. */
				if (vnd->sc_keyctx)
					vndencrypt(vnd,	bp->b_data,
					   bp->b_bcount, bp->b_blkno, 0);
			} else {
				/* Encrypt in place before writing out. */
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
					   bp->b_bcount, bp->b_blkno, 1);
				auio.uio_rw = UIO_WRITE;
				/*
				 * Upper layer has already checked I/O for
				 * limits, so there is no need to do it again.
				 */
				bp->b_error = VOP_WRITE(vnd->sc_vp, &auio,
				    IO_NOLIMIT, vnd->sc_cred);
				/* Data in buffer cache needs to be in clear */
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
					   bp->b_bcount, bp->b_blkno, 0);
			}
			VOP_UNLOCK(vnd->sc_vp, 0, p);
			if (bp->b_error)
				bp->b_flags |= B_ERROR;
			bp->b_resid = auio.uio_resid;
			s = splbio();
			biodone(bp);
			splx(s);

			/* If nothing more is queued, we are done.  */
			if (!vnd->sc_tab.b_active)
				return;

			/*
			 * Dequeue now since lower level strategy
			 * routine might queue using same links.
			 */
			s = splbio();
			bp = vnd->sc_tab.b_actf;
			vnd->sc_tab.b_actf = bp->b_actf;
			vnd->sc_tab.b_active--;
			splx(s);
		}
	}

	/* The bypass flavor needs a plain, unencrypted regular file. */
	if (vnd->sc_vp->v_type != VREG || vnd->sc_keyctx != NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* The old-style buffercache bypassing method.  */
	bn += DL_SECTOBLK(vnd->sc_dk.dk_label,
	    DL_GETPOFFSET(&vnd->sc_dk.dk_label->d_partitions[DISKPART(bp->b_dev)]));
	bn = dbtob(bn);		/* byte offset into the backing file */
	bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize;
	addr = bp->b_data;
	flags = bp->b_flags | B_CALL;
	/*
	 * Chop the request into filesystem-block-sized (or read-ahead
	 * extended) child buffers, each mapped to a physical block of
	 * the underlying filesystem via VOP_BMAP and queued through
	 * vndstart().
	 */
	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vnode *vp;
		daddr64_t nbn;
		int off, nra;

		nra = 0;
		vn_lock(vnd->sc_vp, LK_RETRY | LK_EXCLUSIVE, p);
		error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra);
		VOP_UNLOCK(vnd->sc_vp, 0, p);
		/* nbn == -1 means a hole in the file. */
		if (error == 0 && (long)nbn == -1)
			error = EIO;
#ifdef VNDDEBUG
		if (!dovndcluster)
			nra = 0;
#endif

		/* Size this chunk: up to block end, or nra+1 blocks. */
		if ((off = bn % bsize) != 0)
			sz = bsize - off;
		else
			sz = (1 + nra) * bsize;
		if (resid < sz)
			sz = resid;

		DNPRINTF(VDB_IO, "vndstrategy: vp %p/%p bn %x/%lld sz %x\n",
		    vnd->sc_vp, vp, bn, nbn, sz);

		s = splbio();
		nbp = getvndbuf();
		splx(s);
		nbp->vb_buf.b_flags = flags;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = bp->b_bufsize;
		nbp->vb_buf.b_error = 0;
		if (vp->v_type == VBLK || vp->v_type == VCHR)
			nbp->vb_buf.b_dev = vp->v_rdev;
		else
			nbp->vb_buf.b_dev = NODEV;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_proc = bp->b_proc;
		nbp->vb_buf.b_iodone = vndiodone;
		nbp->vb_buf.b_vp = vp;
		nbp->vb_buf.b_dirtyoff = bp->b_dirtyoff;
		nbp->vb_buf.b_dirtyend = bp->b_dirtyend;
		nbp->vb_buf.b_validoff = bp->b_validoff;
		nbp->vb_buf.b_validend = bp->b_validend;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/* save a reference to the old buffer */
		nbp->vb_obp = bp;

		/*
		 * If there was an error or a hole in the file...punt.
		 * Note that we deal with this after the nbp allocation.
		 * This ensures that we properly clean up any operations
		 * that we have already fired off.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 * We must still however charge for the write even if there
		 * was an error.
		 */
		if (error) {
			nbp->vb_buf.b_error = error;
			nbp->vb_buf.b_flags |= B_ERROR;
			bp->b_resid -= (resid - sz);
			s = splbio();
			/* charge for the write */
			if ((nbp->vb_buf.b_flags & B_READ) == 0)
				nbp->vb_buf.b_vp->v_numoutput++;
			biodone(&nbp->vb_buf);
			splx(s);
			return;
		}
		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		disksort(&vnd->sc_tab, &nbp->vb_buf);
		vnd->sc_tab.b_active++;
		vndstart(vnd);
		splx(s);
		bn += sz;
		addr += sz;
	}
}
614 
/*
 * Feed requests sequentially.
 * We do it this way to keep from flooding NFS servers if we are connected
 * to an NFS file.  This places the burden on the client rather than the
 * server.
 *
 * Called with the transfer queue non-empty; presumably at splbio()
 * (callers in vndstrategy() hold it) — TODO confirm there are no other
 * callers.
 */
void
vndstart(struct vnd_softc *vnd)
{
	struct buf *bp;

	/*
	 * Dequeue now since lower level strategy routine might
	 * queue using same links
	 */
	bp = vnd->sc_tab.b_actf;
	vnd->sc_tab.b_actf = bp->b_actf;

	DNPRINTF(VDB_IO,
	    "vndstart(%d): bp %p vp %p blkno %lld addr %p cnt %lx\n",
	    vnd-vnd_softc, bp, bp->b_vp, bp->b_blkno, bp->b_data,
	    bp->b_bcount);

	/* Instrumentation. */
	disk_busy(&vnd->sc_dk);

	/* Writes must be accounted on the target vnode before dispatch. */
	if ((bp->b_flags & B_READ) == 0)
		bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);
}
645 
/*
 * Completion handler (b_iodone) for the child buffers created in
 * vndstrategy()'s bypass path.  Propagates errors and residuals to the
 * parent buffer, frees the child, updates disk accounting, and calls
 * biodone() on the parent once all of its children have completed.
 * Runs at IPL_BIO.
 */
void
vndiodone(struct buf *bp)
{
	/* The child buf is the first member of struct vndbuf. */
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct buf *pbp = vbp->vb_obp;
	struct vnd_softc *vnd = &vnd_softc[vndunit(pbp->b_dev)];

	splassert(IPL_BIO);

	DNPRINTF(VDB_IO,
	    "vndiodone(%d): vbp %p vp %p blkno %lld addr %p cnt %lx\n",
	    vnd-vnd_softc, vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
	    vbp->vb_buf.b_data, vbp->vb_buf.b_bcount);

	if (vbp->vb_buf.b_error) {
		DNPRINTF(VDB_IO, "vndiodone: vbp %p error %d\n", vbp,
		    vbp->vb_buf.b_error);

		pbp->b_flags |= B_ERROR;
		/* XXX does this matter here? */
		(&vbp->vb_buf)->b_flags |= B_RAW;
		/* Harvest the precise error code for the parent. */
		pbp->b_error = biowait(&vbp->vb_buf);
	}
	pbp->b_resid -= vbp->vb_buf.b_bcount;
	putvndbuf(vbp);
	if (vnd->sc_tab.b_active) {
		disk_unbusy(&vnd->sc_dk, (pbp->b_bcount - pbp->b_resid),
		    (pbp->b_flags & B_READ));
		/* Last outstanding child: the queue drains one slot. */
		if (!vnd->sc_tab.b_actf)
			vnd->sc_tab.b_active--;
	}
	/* Parent is finished only when every chunk has been accounted. */
	if (pbp->b_resid == 0) {
		DNPRINTF(VDB_IO, "vndiodone: pbp %p iodone\n", pbp);
		biodone(pbp);
	}

}
683 
684 /* ARGSUSED */
685 int
686 vndread(dev_t dev, struct uio *uio, int flags)
687 {
688 	int unit = vndunit(dev);
689 	struct vnd_softc *sc;
690 
691 	DNPRINTF(VDB_FOLLOW, "vndread(%x, %p)\n", dev, uio);
692 
693 	if (unit >= numvnd)
694 		return (ENXIO);
695 	sc = &vnd_softc[unit];
696 
697 	if ((sc->sc_flags & VNF_INITED) == 0)
698 		return (ENXIO);
699 
700 	return (physio(vndstrategy, NULL, dev, B_READ, minphys, uio));
701 }
702 
703 /* ARGSUSED */
704 int
705 vndwrite(dev_t dev, struct uio *uio, int flags)
706 {
707 	int unit = vndunit(dev);
708 	struct vnd_softc *sc;
709 
710 	DNPRINTF(VDB_FOLLOW, "vndwrite(%x, %p)\n", dev, uio);
711 
712 	if (unit >= numvnd)
713 		return (ENXIO);
714 	sc = &vnd_softc[unit];
715 
716 	if ((sc->sc_flags & VNF_INITED) == 0)
717 		return (ENXIO);
718 
719 	return (physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio));
720 }
721 
722 size_t
723 vndbdevsize(struct vnode *vp, struct proc *p)
724 {
725 	struct partinfo pi;
726 	struct bdevsw *bsw;
727 	dev_t dev;
728 
729 	dev = vp->v_rdev;
730 	bsw = bdevsw_lookup(dev);
731 	if (bsw->d_ioctl == NULL)
732 		return (0);
733 	if (bsw->d_ioctl(dev, DIOCGPART, (caddr_t)&pi, FREAD, p))
734 		return (0);
735 	DNPRINTF(VDB_INIT, "vndbdevsize: size %li secsize %li\n",
736 	    (long)pi.part->p_size,(long)pi.disklab->d_secsize);
737 	return (pi.part->p_size);
738 }
739 
/*
 * Control interface for vnd: configuration (VNDIOCSET/VNDIOCCLR),
 * status queries (VNDIOCGET), and the standard disklabel ioctls.
 * All commands require superuser.
 */
/* ARGSUSED */
int
vndioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	int unit = vndunit(dev);
	struct disklabel *lp;
	struct vnd_softc *vnd;
	struct vnd_ioctl *vio;
	struct vnd_user *vnu;
	struct vattr vattr;
	struct nameidata nd;
	int error, part, pmask, s;

	DNPRINTF(VDB_FOLLOW, "vndioctl(%x, %lx, %p, %x, %p): unit %d\n",
	    dev, cmd, addr, flag, p, unit);

	error = suser(p, 0);
	if (error)
		return (error);
	if (unit >= numvnd)
		return (ENXIO);

	vnd = &vnd_softc[unit];
	vio = (struct vnd_ioctl *)addr;
	switch (cmd) {

	case VNDIOCSET:
		/* Associate this unit with a backing file. */
		if (vnd->sc_flags & VNF_INITED)
			return (EBUSY);
		/* Encryption is only supported on the simple flavor. */
		if (!(vnd->sc_flags & VNF_SIMPLE) && vio->vnd_keylen)
			return (EINVAL);

		if ((error = vndlock(vnd)) != 0)
			return (error);

		if ((error = copyinstr(vio->vnd_file, vnd->sc_file,
		    sizeof(vnd->sc_file), NULL))) {
			vndunlock(vnd);
			return (error);
		}

		bzero(vnd->sc_dev.dv_xname, sizeof(vnd->sc_dev.dv_xname));
		if (snprintf(vnd->sc_dev.dv_xname, sizeof(vnd->sc_dev.dv_xname),
		    "vnd%d", unit) >= sizeof(vnd->sc_dev.dv_xname)) {
			printf("VNDIOCSET: device name too long\n");
			vndunlock(vnd);
			return(ENXIO);
		}

		/* Set geometry for device. */
		vnd->sc_secsize = vio->vnd_secsize;
		vnd->sc_ntracks = vio->vnd_ntracks;
		vnd->sc_nsectors = vio->vnd_nsectors;

		/*
		 * Open for read and write first. This lets vn_open() weed out
		 * directories, sockets, etc. so we don't have to worry about
		 * them.
		 */
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vnd_file, p);
		vnd->sc_flags &= ~VNF_READONLY;
		error = vn_open(&nd, FREAD|FWRITE, 0);
		if (error == EROFS) {
			/* Fall back to a read-only configuration. */
			vnd->sc_flags |= VNF_READONLY;
			error = vn_open(&nd, FREAD, 0);
		}
		if (error) {
			vndunlock(vnd);
			return (error);
		}

		/* Non-simple flavor only works on regular files. */
		if (nd.ni_vp->v_type != VREG && !vndsimple(dev)) {
			VOP_UNLOCK(nd.ni_vp, 0, p);
			vn_close(nd.ni_vp, VNDRW(vnd), p->p_ucred, p);
			vndunlock(vnd);
			return (EINVAL);
		}

		/* Size in sectors: from the device, or the file's length. */
		if (nd.ni_vp->v_type == VBLK)
			vnd->sc_size = vndbdevsize(nd.ni_vp, p);
		else {
			error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p);
			if (error) {
				VOP_UNLOCK(nd.ni_vp, 0, p);
				vn_close(nd.ni_vp, VNDRW(vnd), p->p_ucred, p);
				vndunlock(vnd);
				return (error);
			}
			vnd->sc_size = vattr.va_size / vnd->sc_secsize;
		}
		VOP_UNLOCK(nd.ni_vp, 0, p);
		vnd->sc_vp = nd.ni_vp;
		if ((error = vndsetcred(vnd, p->p_ucred)) != 0) {
			(void) vn_close(nd.ni_vp, VNDRW(vnd), p->p_ucred, p);
			vndunlock(vnd);
			return (error);
		}

		/* Optional encryption key: copy in and expand, then wipe. */
		if (vio->vnd_keylen > 0) {
			char key[BLF_MAXUTILIZED];

			if (vio->vnd_keylen > sizeof(key))
				vio->vnd_keylen = sizeof(key);

			if ((error = copyin(vio->vnd_key, key,
			    vio->vnd_keylen)) != 0) {
				(void) vn_close(nd.ni_vp, VNDRW(vnd),
				    p->p_ucred, p);
				vndunlock(vnd);
				return (error);
			}

			vnd->sc_keyctx = malloc(sizeof(*vnd->sc_keyctx), M_DEVBUF,
			    M_WAITOK);
			blf_key(vnd->sc_keyctx, key, vio->vnd_keylen);
			bzero(key, vio->vnd_keylen);
		} else
			vnd->sc_keyctx = NULL;

		vio->vnd_size = vnd->sc_size * vnd->sc_secsize;
		vnd->sc_flags |= VNF_INITED;

		DNPRINTF(VDB_INIT, "vndioctl: SET vp %p size %llx\n",
		    vnd->sc_vp, (unsigned long long)vnd->sc_size);

		/* Attach the disk. */
		vnd->sc_dk.dk_driver = &vnddkdriver;
		vnd->sc_dk.dk_name = vnd->sc_dev.dv_xname;
		disk_attach(&vnd->sc_dk);

		vndunlock(vnd);

		break;

	case VNDIOCCLR:
		/* Detach the unit from its backing file. */
		if ((vnd->sc_flags & VNF_INITED) == 0)
			return (ENXIO);

		if ((error = vndlock(vnd)) != 0)
			return (error);

		/*
		 * Don't unconfigure if any other partitions are open
		 * or if both the character and block flavors of this
		 * partition are open.
		 */
		part = DISKPART(dev);
		pmask = (1 << part);
		if ((vnd->sc_dk.dk_openmask & ~pmask) ||
		    ((vnd->sc_dk.dk_bopenmask & pmask) &&
		    (vnd->sc_dk.dk_copenmask & pmask))) {
			vndunlock(vnd);
			return (EBUSY);
		}

		vndclear(vnd);
		DNPRINTF(VDB_INIT, "vndioctl: CLRed\n");

		/* Free crypto key */
		if (vnd->sc_keyctx) {
			/* Wipe key material before releasing the memory. */
			bzero(vnd->sc_keyctx, sizeof(*vnd->sc_keyctx));
			free(vnd->sc_keyctx, M_DEVBUF);
		}

		/* Detach the disk. */
		disk_detach(&vnd->sc_dk);

		/* This must be atomic. */
		s = splhigh();
		vndunlock(vnd);
		bzero(vnd, sizeof(struct vnd_softc));
		splx(s);
		break;

	case VNDIOCGET:
		/* Report a unit's backing file, device and inode. */
		vnu = (struct vnd_user *)addr;

		if (vnu->vnu_unit == -1)
			vnu->vnu_unit = unit;
		if (vnu->vnu_unit >= numvnd)
			return (ENXIO);
		if (vnu->vnu_unit < 0)
			return (EINVAL);

		vnd = &vnd_softc[vnu->vnu_unit];

		if (vnd->sc_flags & VNF_INITED) {
			error = VOP_GETATTR(vnd->sc_vp, &vattr, p->p_ucred, p);
			if (error)
				return (error);

			strlcpy(vnu->vnu_file, vnd->sc_file,
			    sizeof(vnu->vnu_file));
			vnu->vnu_dev = vattr.va_fsid;
			vnu->vnu_ino = vattr.va_fileid;
		} else {
			vnu->vnu_dev = 0;
			vnu->vnu_ino = 0;
		}

		break;

	case DIOCRLDINFO:
		/* Re-read the label from the "disk". */
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		lp = malloc(sizeof(*lp), M_TEMP, M_WAITOK);
		vndgetdisklabel(dev, vnd, lp, 0);
		*(vnd->sc_dk.dk_label) = *lp;
		free(lp, M_TEMP);
		return (0);

	case DIOCGPDINFO:
		/* Return the spoofed (default) label. */
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		vndgetdisklabel(dev, vnd, (struct disklabel *)addr, 1);
		return (0);

	case DIOCGDINFO:
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		*(struct disklabel *)addr = *(vnd->sc_dk.dk_label);
		return (0);

	case DIOCGPART:
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		((struct partinfo *)addr)->disklab = vnd->sc_dk.dk_label;
		((struct partinfo *)addr)->part =
		    &vnd->sc_dk.dk_label->d_partitions[DISKPART(dev)];
		return (0);

	case DIOCWDINFO:
	case DIOCSDINFO:
		/* Set the in-core label; DIOCWDINFO also writes it out. */
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		if ((flag & FWRITE) == 0)
			return (EBADF);

		if ((error = vndlock(vnd)) != 0)
			return (error);
		vnd->sc_flags |= VNF_LABELLING;

		error = setdisklabel(vnd->sc_dk.dk_label,
		    (struct disklabel *)addr, /*vnd->sc_dk.dk_openmask : */0);
		if (error == 0) {
			if (cmd == DIOCWDINFO)
				error = writedisklabel(VNDLABELDEV(dev),
				    vndstrategy, vnd->sc_dk.dk_label);
		}

		vnd->sc_flags &= ~VNF_LABELLING;
		vndunlock(vnd);
		return (error);

	case DIOCWLABEL:
		/* Toggle permission to overwrite the on-disk label. */
		if ((flag & FWRITE) == 0)
			return (EBADF);
		if (*(int *)addr)
			vnd->sc_flags |= VNF_WLABEL;
		else
			vnd->sc_flags &= ~VNF_WLABEL;
		return (0);

	default:
		return (ENOTTY);
	}

	return (0);
}
1009 
/*
 * Duplicate the current processes' credentials.  Since we are called only
 * as the result of a SET ioctl and only root can do that, any future access
 * to this "disk" is essentially as root.  Note that credentials may change
 * if some other uid can write directly to the mapped file (NFS).
 *
 * NOTE(review): on error the crdup()'d credential remains in sc_cred
 * while VNF_INITED is never set, so it is not released by VNDIOCCLR's
 * vndclear() path — verify whether a failed VNDIOCSET leaks it.
 */
int
vndsetcred(struct vnd_softc *vnd, struct ucred *cred)
{
	struct uio auio;
	struct iovec aiov;
	char *tmpbuf;
	int error;
	struct proc *p = curproc;

	vnd->sc_cred = crdup(cred);
	tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);

	/* XXX: Horrible kludge to establish credentials for NFS */
	aiov.iov_base = tmpbuf;
	/* Read at most one block, or the whole (smaller) device. */
	aiov.iov_len = MIN(DEV_BSIZE, vnd->sc_size * vnd->sc_secsize);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_resid = aiov.iov_len;
	vn_lock(vnd->sc_vp, LK_RETRY | LK_EXCLUSIVE, p);
	error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred);
	VOP_UNLOCK(vnd->sc_vp, 0, p);

	free(tmpbuf, M_TEMP);
	return (error);
}
1044 
1045 void
1046 vndshutdown(void)
1047 {
1048 	struct vnd_softc *vnd;
1049 
1050 	for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++)
1051 		if (vnd->sc_flags & VNF_INITED)
1052 			vndclear(vnd);
1053 }
1054 
1055 void
1056 vndclear(struct vnd_softc *vnd)
1057 {
1058 	struct vnode *vp = vnd->sc_vp;
1059 	struct proc *p = curproc;		/* XXX */
1060 
1061 	DNPRINTF(VDB_FOLLOW, "vndclear(%p): vp %p\n", vnd, vp);
1062 
1063 	vnd->sc_flags &= ~VNF_INITED;
1064 	if (vp == NULL)
1065 		panic("vndioctl: null vp");
1066 	(void) vn_close(vp, VNDRW(vnd), vnd->sc_cred, p);
1067 	crfree(vnd->sc_cred);
1068 	vnd->sc_vp = NULL;
1069 	vnd->sc_cred = NULL;
1070 	vnd->sc_size = 0;
1071 }
1072 
1073 daddr64_t
1074 vndsize(dev_t dev)
1075 {
1076 	int unit = vndunit(dev);
1077 	struct vnd_softc *vnd = &vnd_softc[unit];
1078 
1079 	if (unit >= numvnd || (vnd->sc_flags & VNF_INITED) == 0)
1080 		return (-1);
1081 	return (vnd->sc_size * (vnd->sc_secsize / DEV_BSIZE));
1082 }
1083 
/*
 * Kernel crash-dump entry point.  Dumping to a vnd device is not
 * supported, so always refuse.
 */
int
vnddump(dev_t dev, daddr64_t blkno, caddr_t va, size_t size)
{

	/* Not implemented. */
	return (ENXIO);
}
1091