xref: /openbsd-src/sys/dev/vnd.c (revision 8500990981f885cbe5e6a4958549cacc238b5ae6)
1 /*	$OpenBSD: vnd.c,v 1.41 2003/10/17 23:05:39 tedu Exp $	*/
2 /*	$NetBSD: vnd.c,v 1.26 1996/03/30 23:06:11 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1988 University of Utah.
6  * Copyright (c) 1990, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  * from: Utah $Hdr: vn.c 1.13 94/04/02$
38  *
39  *	@(#)vn.c	8.6 (Berkeley) 4/1/94
40  */
41 
42 /*
43  * Vnode disk driver.
44  *
45  * Block/character interface to a vnode.  Allows one to treat a file
46  * as a disk (e.g. build a filesystem in it, mount it, etc.).
47  *
48  * NOTE 1: This uses either the VOP_BMAP/VOP_STRATEGY interface to the
49  * vnode or simple VOP_READ/VOP_WRITE.  The former is suitable for swapping
50  * as it doesn't distort the local buffer cache.  The latter is good for
51  * building disk images as it keeps the cache consistent after the block
52  * device is closed.
53  *
54  * NOTE 2: There is a security issue involved with this driver.
55  * Once mounted all access to the contents of the "mapped" file via
56  * the special file is controlled by the permissions on the special
57  * file, the protection of the mapped file is ignored (effectively,
58  * by using root credentials in all transactions).
59  *
60  * NOTE 3: Doesn't interact with leases, should it?
61  */
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/namei.h>
66 #include <sys/proc.h>
67 #include <sys/errno.h>
68 #include <sys/buf.h>
69 #include <sys/malloc.h>
70 #include <sys/ioctl.h>
71 #include <sys/disklabel.h>
72 #include <sys/device.h>
73 #include <sys/disk.h>
74 #include <sys/stat.h>
75 #include <sys/mount.h>
76 #include <sys/vnode.h>
77 #include <sys/file.h>
78 #include <sys/uio.h>
79 #include <sys/conf.h>
80 
81 #include <crypto/blf.h>
82 
83 #include <miscfs/specfs/specdev.h>
84 
85 #include <dev/vndioctl.h>
86 
#ifdef DEBUG
int dovndcluster = 1;		/* allow clustered transfers in bmap path */
int vnddebug = 0x00;		/* bitmask of VDB_* trace flags */
#define VDB_FOLLOW	0x01	/* trace function entry */
#define VDB_INIT	0x02	/* trace configure/unconfigure */
#define VDB_IO		0x04	/* trace individual transfers */
#endif

/*
 * Alias: disksort() keys on b_cylin; vndstrategy() stores the block
 * number there, overlaying the b_resid field of the component buffer.
 */
#define b_cylin	b_resid

/*
 * vndunit is a bit weird.  have to reconstitute the dev_t for
 * DISKUNIT(), but with the minor masked off.
 */
#define	vndunit(x)	DISKUNIT(makedev(major(x), minor(x) & 0x7ff))
#define vndsimple(x)	(minor(x) & 0x800)	/* minor bit selects I/O flavor */
#define	MAKEVNDDEV(maj, unit, part)	MAKEDISKDEV(maj, unit, part)

#define	VNDLABELDEV(dev) (MAKEVNDDEV(major(dev), vndunit(dev), RAW_PART))

/*
 * Component buffer for the bmap/strategy path: the buffer handed to
 * the underlying device plus a pointer back to the original request.
 */
struct vndbuf {
	struct buf	vb_buf;		/* buffer sent to the device */
	struct buf	*vb_obp;	/* original buffer from caller */
};

#define	getvndbuf()	\
	((struct vndbuf *)malloc(sizeof(struct vndbuf), M_DEVBUF, M_WAITOK))
#define putvndbuf(vbp)	\
	free((caddr_t)(vbp), M_DEVBUF)

/* Per-unit driver state. */
struct vnd_softc {
	struct device	 sc_dev;
	struct disk	 sc_dk;

	int		 sc_flags;	/* flags */
	size_t		 sc_size;	/* size of vnd in blocks */
	struct vnode	*sc_vp;		/* vnode */
	struct ucred	*sc_cred;	/* credentials */
	int		 sc_maxactive;	/* max # of active requests */
	struct buf	 sc_tab;	/* transfer queue */
	void		*sc_keyctx;	/* key context */
};

/* sc_flags */
#define	VNF_ALIVE	0x0001
#define VNF_INITED	0x0002		/* unit is configured (VNDIOCSET) */
#define VNF_WANTED	0x0040		/* a process sleeps on the unit lock */
#define VNF_LOCKED	0x0080		/* unit lock held, see vndlock() */
#define	VNF_LABELLING	0x0100		/* disklabel update in progress */
#define	VNF_WLABEL	0x0200		/* set/cleared via DIOCWLABEL */
#define	VNF_HAVELABEL	0x0400		/* disklabel has been loaded */
#define VNF_BUSY	0x0800		/* simple-path I/O in progress */
#define VNF_SIMPLE	0x1000		/* unit opened via "simple" minor */

struct vnd_softc *vnd_softc;	/* array of numvnd units, see vndattach() */
int numvnd = 0;

struct dkdriver vnddkdriver = { vndstrategy };

/* called by main() at boot time */
void	vndattach(int);

void	vndclear(struct vnd_softc *);
void	vndstart(struct vnd_softc *);
int	vndsetcred(struct vnd_softc *, struct ucred *);
void	vndthrottle(struct vnd_softc *, struct vnode *);
void	vndiodone(struct buf *);
void	vndshutdown(void);
void	vndgetdisklabel(dev_t, struct vnd_softc *);
void	vndencrypt(struct vnd_softc *, caddr_t, size_t, daddr_t, int);

int	vndlock(struct vnd_softc *);
void	vndunlock(struct vnd_softc *);
160 
161 void
162 vndencrypt(vnd, addr, size, off, encrypt)
163      struct vnd_softc *vnd;
164      caddr_t addr;
165      size_t size;
166      daddr_t off;
167      int encrypt;
168 {
169 	int i, bsize;
170 	u_char iv[8];
171 
172 	bsize = dbtob(1);
173 	for (i = 0; i < size/bsize; i++) {
174 		bzero(iv, sizeof(iv));
175 		bcopy((u_char *)&off, iv, sizeof(off));
176 		blf_ecb_encrypt(vnd->sc_keyctx, iv, sizeof(iv));
177 		if (encrypt)
178 			blf_cbc_encrypt(vnd->sc_keyctx, iv, addr, bsize);
179 		else
180 			blf_cbc_decrypt(vnd->sc_keyctx, iv, addr, bsize);
181 
182 		addr += bsize;
183 		off++;
184 	}
185 }
186 
187 void
188 vndattach(num)
189 	int num;
190 {
191 	char *mem;
192 	u_long size;
193 
194 	if (num <= 0)
195 		return;
196 	size = num * sizeof(struct vnd_softc);
197 	mem = malloc(size, M_DEVBUF, M_NOWAIT);
198 	if (mem == NULL) {
199 		printf("WARNING: no memory for vnode disks\n");
200 		return;
201 	}
202 	bzero(mem, size);
203 	vnd_softc = (struct vnd_softc *)mem;
204 	numvnd = num;
205 }
206 
/*
 * Open a partition of a vnd unit.  Reads the disklabel on the first
 * open of a configured unit, enforces that all concurrent opens use
 * the same I/O flavor (simple vs. bmap, chosen by the minor number),
 * validates the partition, and records the open in the per-unit masks
 * so the unit cannot be unconfigured while in use.
 */
int
vndopen(dev, flags, mode, p)
	dev_t dev;
	int flags, mode;
	struct proc *p;
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;
	int error = 0, part, pmask;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndopen(%x, %x, %x, %p)\n", dev, flags, mode, p);
#endif
	if (unit >= numvnd)
		return (ENXIO);
	sc = &vnd_softc[unit];

	if ((error = vndlock(sc)) != 0)
		return (error);

	/* First open of a configured unit: load the disklabel. */
	if ((sc->sc_flags & VNF_INITED) &&
	    (sc->sc_flags & VNF_HAVELABEL) == 0) {
		sc->sc_flags |= VNF_HAVELABEL;
		vndgetdisklabel(dev, sc);
	}

	part = DISKPART(dev);
	pmask = 1 << part;

	/*
	 * If any partition is open, all succeeding openings must be of the
	 * same type.
	 */
	if (sc->sc_dk.dk_openmask) {
		if (((sc->sc_flags & VNF_SIMPLE) != 0) !=
		    (vndsimple(dev) != 0)) {
			error = EBUSY;
			goto bad;
		}
	} else if (vndsimple(dev))
		sc->sc_flags |= VNF_SIMPLE;
	else
		sc->sc_flags &= ~VNF_SIMPLE;

	/* Check that the partition exists. */
	if (part != RAW_PART &&
	    ((sc->sc_flags & VNF_HAVELABEL) == 0 ||
	    part >= sc->sc_dk.dk_label->d_npartitions ||
	    sc->sc_dk.dk_label->d_partitions[part].p_fstype == FS_UNUSED)) {
		error = ENXIO;
		goto bad;
	}

	/* Prevent our unit from being unconfigured while open. */
	switch (mode) {
	case S_IFCHR:
		sc->sc_dk.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		sc->sc_dk.dk_bopenmask |= pmask;
		break;
	}
	sc->sc_dk.dk_openmask =
	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;

	vndunlock(sc);
	return (0);
bad:
	vndunlock(sc);
	return (error);
}
280 
281 /*
282  * Load the label information on the named device
283  */
284 void
285 vndgetdisklabel(dev, sc)
286 	dev_t dev;
287 	struct vnd_softc *sc;
288 {
289 	struct disklabel *lp = sc->sc_dk.dk_label;
290 	char *errstring;
291 
292 	bzero(lp, sizeof(struct disklabel));
293 	bzero(sc->sc_dk.dk_cpulabel, sizeof(struct cpu_disklabel));
294 
295 	lp->d_secsize = 512;
296 	lp->d_ntracks = 1;
297 	lp->d_nsectors = 100;
298 	lp->d_ncylinders = sc->sc_size / 100;
299 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
300 	if (lp->d_secpercyl == 0) {
301 		lp->d_secpercyl = 100;
302 		/* as long as it's not 0 - readdisklabel divides by it (?) */
303 	}
304 
305 	strncpy(lp->d_typename, "vnd device", sizeof(lp->d_typename));
306 	lp->d_type = DTYPE_SCSI;
307 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
308 	lp->d_secperunit = sc->sc_size;
309 	lp->d_rpm = 3600;
310 	lp->d_interleave = 1;
311 	lp->d_flags = 0;
312 
313 	lp->d_partitions[RAW_PART].p_offset = 0;
314 	lp->d_partitions[RAW_PART].p_size =
315 	    lp->d_secperunit * (lp->d_secsize / DEV_BSIZE);
316 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
317 	lp->d_npartitions = RAW_PART + 1;
318 
319 	lp->d_magic = DISKMAGIC;
320 	lp->d_magic2 = DISKMAGIC;
321 	lp->d_checksum = dkcksum(lp);
322 
323 	/*
324 	 * Call the generic disklabel extraction routine
325 	 */
326 	errstring = readdisklabel(VNDLABELDEV(dev), vndstrategy, lp,
327 	    sc->sc_dk.dk_cpulabel, 0);
328 	if (errstring) {
329 		/*printf("%s: %s\n", sc->sc_dev.dv_xname, errstring);*/
330 		return;
331 	}
332 }
333 
334 int
335 vndclose(dev, flags, mode, p)
336 	dev_t dev;
337 	int flags, mode;
338 	struct proc *p;
339 {
340 	int unit = vndunit(dev);
341 	struct vnd_softc *sc;
342 	int error = 0, part;
343 
344 #ifdef DEBUG
345 	if (vnddebug & VDB_FOLLOW)
346 		printf("vndclose(%x, %x, %x, %p)\n", dev, flags, mode, p);
347 #endif
348 
349 	if (unit >= numvnd)
350 		return (ENXIO);
351 	sc = &vnd_softc[unit];
352 
353 	if ((error = vndlock(sc)) != 0)
354 		return (error);
355 
356 	part = DISKPART(dev);
357 
358 	/* ...that much closer to allowing unconfiguration... */
359 	switch (mode) {
360 	case S_IFCHR:
361 		sc->sc_dk.dk_copenmask &= ~(1 << part);
362 		break;
363 
364 	case S_IFBLK:
365 		sc->sc_dk.dk_bopenmask &= ~(1 << part);
366 		break;
367 	}
368 	sc->sc_dk.dk_openmask =
369 	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;
370 
371 	vndunlock(sc);
372 	return (0);
373 }
374 
375 /*
376  * Two methods are used, the traditional buffercache bypassing and the
377  * newer, cache-coherent on unmount, one.
378  *
379  * Former method:
380  * Break the request into bsize pieces and submit using VOP_BMAP/VOP_STRATEGY.
381  * Note that this driver can only be used for swapping over NFS on the hp
382  * since nfs_strategy on the vax cannot handle u-areas and page tables.
383  *
384  * Latter method:
385  * Repack the buffer into an uio structure and use VOP_READ/VOP_WRITE to
386  * access the underlying file.  Things are complicated by the fact that we
387  * might get recursively called due to buffer flushes.  In those cases we
388  * queue one write.
389  */
/*
 * Driver strategy entry point for both I/O flavors.  After validating
 * the request it either performs it synchronously with VOP_READ/
 * VOP_WRITE (the "simple" flavor, minor bit 0x800) or breaks it into
 * filesystem-block-sized pieces submitted via VOP_BMAP/VOP_STRATEGY.
 */
void
vndstrategy(bp)
	struct buf *bp;
{
	int unit = vndunit(bp->b_dev);
	struct vnd_softc *vnd = &vnd_softc[unit];
	struct vndbuf *nbp;
	int bsize;
	off_t bn;
	caddr_t addr;
	size_t resid;
	int sz, flags, error, s;
	struct iovec aiov;
	struct uio auio;
	struct proc *p = curproc;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndstrategy(%p): unit %d\n", bp, unit);
#endif
	/* Reject I/O on an unconfigured unit. */
	if ((vnd->sc_flags & VNF_INITED) == 0) {
		bp->b_error = ENXIO;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	bn = bp->b_blkno;
	sz = howmany(bp->b_bcount, DEV_BSIZE);
	bp->b_resid = bp->b_bcount;
	if (bn < 0) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}
	/* Clip the transfer to the partition; done if it lies outside. */
	if (DISKPART(bp->b_dev) != RAW_PART &&
	    bounds_check_with_label(bp, vnd->sc_dk.dk_label,
	    vnd->sc_dk.dk_cpulabel, 1) <= 0) {
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* No bypassing of buffer cache?  */
	if (vndsimple(bp->b_dev)) {
		/*
		 * In order to avoid "locking against myself" panics, we
		 * must be prepared to queue operations during another I/O
		 * operation.  This situation comes up where a dirty cache
		 * buffer needs to be flushed in order to provide the current
		 * operation with a fresh buffer.
		 *
		 * XXX do we really need to protect stuff relating to this with
		 * splbio?
		 */
		if (vnd->sc_flags & VNF_BUSY) {
			/* Recursive entry: queue the request for later. */
			s = splbio();
			bp->b_actf = vnd->sc_tab.b_actf;
			vnd->sc_tab.b_actf = bp;
			vnd->sc_tab.b_active++;
			splx(s);
			return;
		}

		/* Loop until all queued requests are handled.  */
		for (;;) {
			int part = DISKPART(bp->b_dev);
			int off = vnd->sc_dk.dk_label->d_partitions[part].p_offset;

			aiov.iov_base = bp->b_data;
			auio.uio_resid = aiov.iov_len = bp->b_bcount;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = dbtob((off_t)(bp->b_blkno + off));
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_procp = NULL;

			vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
			vnd->sc_flags |= VNF_BUSY;
			if (bp->b_flags & B_READ) {
				auio.uio_rw = UIO_READ;
				bp->b_error = VOP_READ(vnd->sc_vp, &auio, 0,
				    vnd->sc_cred);
				/* Decrypt what was read from the file. */
				if (vnd->sc_keyctx)
					vndencrypt(vnd,	bp->b_data,
						   bp->b_bcount,
						   bp->b_blkno, 0);
			} else {
				/* Encrypt in place before writing out... */
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
						   bp->b_bcount,
						   bp->b_blkno, 1);
				auio.uio_rw = UIO_WRITE;
				bp->b_error = VOP_WRITE(vnd->sc_vp, &auio, 0,
				    vnd->sc_cred);
				/* Data in buffer cache needs to be in clear */
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
						   bp->b_bcount,
						   bp->b_blkno, 0);
			}
			vnd->sc_flags &= ~VNF_BUSY;
			VOP_UNLOCK(vnd->sc_vp, 0, p);
			if (bp->b_error)
				bp->b_flags |= B_ERROR;
			bp->b_resid = auio.uio_resid;
			s = splbio();
			biodone(bp);
			splx(s);

			/* If nothing more is queued, we are done.  */
			if (!vnd->sc_tab.b_active)
				return;

			/*
			 * Dequeue now since lower level strategy
			 * routine might queue using same links.
			 */
			s = splbio();
			bp = vnd->sc_tab.b_actf;
			vnd->sc_tab.b_actf = bp->b_actf;
			vnd->sc_tab.b_active--;
			splx(s);
		}
	}

	/* The old-style buffercache bypassing method.  */
	bn += vnd->sc_dk.dk_label->d_partitions[DISKPART(bp->b_dev)].p_offset;
	bn = dbtob(bn);
	bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize;
	addr = bp->b_data;
	flags = bp->b_flags | B_CALL;
	/* Map and submit one filesystem-block-aligned piece per pass. */
	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vnode *vp;
		daddr_t nbn;
		int off, s, nra;

		nra = 0;
		vn_lock(vnd->sc_vp, LK_RETRY | LK_EXCLUSIVE, p);
		error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra);
		VOP_UNLOCK(vnd->sc_vp, 0, p);
		/* A hole in the file maps to block -1; we can't handle it. */
		if (error == 0 && (long)nbn == -1)
			error = EIO;
#ifdef DEBUG
		if (!dovndcluster)
			nra = 0;
#endif

		if ((off = bn % bsize) != 0)
			sz = bsize - off;
		else
			sz = (1 + nra) * bsize;
		if (resid < sz)
			sz = resid;
#ifdef DEBUG
		if (vnddebug & VDB_IO)
			printf("vndstrategy: vp %p/%p bn %x/%x sz %x\n",
			       vnd->sc_vp, vp, bn, nbn, sz);
#endif

		nbp = getvndbuf();
		nbp->vb_buf.b_flags = flags;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = bp->b_bufsize;
		nbp->vb_buf.b_error = 0;
		if (vp->v_type == VBLK || vp->v_type == VCHR)
			nbp->vb_buf.b_dev = vp->v_rdev;
		else
			nbp->vb_buf.b_dev = NODEV;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_proc = bp->b_proc;
		nbp->vb_buf.b_iodone = vndiodone;
		nbp->vb_buf.b_vp = vp;
		nbp->vb_buf.b_dirtyoff = bp->b_dirtyoff;
		nbp->vb_buf.b_dirtyend = bp->b_dirtyend;
		nbp->vb_buf.b_validoff = bp->b_validoff;
		nbp->vb_buf.b_validend = bp->b_validend;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/* save a reference to the old buffer */
		nbp->vb_obp = bp;

		/*
		 * If there was an error or a hole in the file...punt.
		 * Note that we deal with this after the nbp allocation.
		 * This ensures that we properly clean up any operations
		 * that we have already fired off.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			nbp->vb_buf.b_error = error;
			nbp->vb_buf.b_flags |= B_ERROR;
			bp->b_resid -= (resid - sz);
			s = splbio();
			biodone(&nbp->vb_buf);
			splx(s);
			return;
		}
		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylin = nbp->vb_buf.b_blkno;
		s = splbio();
		disksort(&vnd->sc_tab, &nbp->vb_buf);
		if (vnd->sc_tab.b_active < vnd->sc_maxactive) {
			vnd->sc_tab.b_active++;
			vndstart(vnd);
		}
		splx(s);

		bn += sz;
		addr += sz;
	}
}
613 
614 /*
615  * Feed requests sequentially.
616  * We do it this way to keep from flooding NFS servers if we are connected
617  * to an NFS file.  This places the burden on the client rather than the
618  * server.
619  */
620 void
621 vndstart(vnd)
622 	struct vnd_softc *vnd;
623 {
624 	struct buf *bp;
625 
626 	/*
627 	 * Dequeue now since lower level strategy routine might
628 	 * queue using same links
629 	 */
630 	bp = vnd->sc_tab.b_actf;
631 	vnd->sc_tab.b_actf = bp->b_actf;
632 #ifdef DEBUG
633 	if (vnddebug & VDB_IO)
634 		printf("vndstart(%d): bp %p vp %p blkno %x addr %p cnt %lx\n",
635 		    vnd-vnd_softc, bp, bp->b_vp, bp->b_blkno, bp->b_data,
636 		    bp->b_bcount);
637 #endif
638 
639 	/* Instrumentation. */
640 	disk_busy(&vnd->sc_dk);
641 
642 	if ((bp->b_flags & B_READ) == 0)
643 		bp->b_vp->v_numoutput++;
644 	VOP_STRATEGY(bp);
645 }
646 
/*
 * Interrupt-time completion handler for the bmap/strategy path.
 * Propagates error and byte counts from the finished component buffer
 * to the original request, completes the original once all of its
 * pieces are in, and keeps the per-unit transfer queue moving.
 */
void
vndiodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct buf *pbp = vbp->vb_obp;
	struct vnd_softc *vnd = &vnd_softc[vndunit(pbp->b_dev)];
	long count;

	splassert(IPL_BIO);

#ifdef DEBUG
	if (vnddebug & VDB_IO)
		printf("vndiodone(%d): vbp %p vp %p blkno %x addr %p cnt %lx\n",
		    vnd-vnd_softc, vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
		    vbp->vb_buf.b_data, vbp->vb_buf.b_bcount);
#endif

	if (vbp->vb_buf.b_error) {
#ifdef DEBUG
		if (vnddebug & VDB_IO)
			printf("vndiodone: vbp %p error %d\n", vbp,
			    vbp->vb_buf.b_error);
#endif
		pbp->b_flags |= B_ERROR;
		/* biowait() collects the completed buffer's error code. */
		pbp->b_error = biowait(&vbp->vb_buf);
	}
	/* Account this piece; the original is done when b_resid reaches 0. */
	pbp->b_resid -= vbp->vb_buf.b_bcount;
	putvndbuf(vbp);
	count = pbp->b_bcount - pbp->b_resid;
	if (pbp->b_resid == 0) {
#ifdef DEBUG
		if (vnddebug & VDB_IO)
			printf("vndiodone: pbp %p iodone\n", pbp);
#endif
		biodone(pbp);
	}
	/* Start the next queued transfer, or retire this active slot. */
	if (vnd->sc_tab.b_active) {
		disk_unbusy(&vnd->sc_dk, count);
		if (vnd->sc_tab.b_actf)
			vndstart(vnd);
		else
			vnd->sc_tab.b_active--;
	}
}
692 
693 /* ARGSUSED */
694 int
695 vndread(dev, uio, flags)
696 	dev_t dev;
697 	struct uio *uio;
698 	int flags;
699 {
700 	int unit = vndunit(dev);
701 	struct vnd_softc *sc;
702 
703 #ifdef DEBUG
704 	if (vnddebug & VDB_FOLLOW)
705 		printf("vndread(%x, %p)\n", dev, uio);
706 #endif
707 
708 	if (unit >= numvnd)
709 		return (ENXIO);
710 	sc = &vnd_softc[unit];
711 
712 	if ((sc->sc_flags & VNF_INITED) == 0)
713 		return (ENXIO);
714 
715 	return (physio(vndstrategy, NULL, dev, B_READ, minphys, uio));
716 }
717 
718 /* ARGSUSED */
719 int
720 vndwrite(dev, uio, flags)
721 	dev_t dev;
722 	struct uio *uio;
723 	int flags;
724 {
725 	int unit = vndunit(dev);
726 	struct vnd_softc *sc;
727 
728 #ifdef DEBUG
729 	if (vnddebug & VDB_FOLLOW)
730 		printf("vndwrite(%x, %p)\n", dev, uio);
731 #endif
732 
733 	if (unit >= numvnd)
734 		return (ENXIO);
735 	sc = &vnd_softc[unit];
736 
737 	if ((sc->sc_flags & VNF_INITED) == 0)
738 		return (ENXIO);
739 
740 	return (physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio));
741 }
742 
743 /* ARGSUSED */
744 int
745 vndioctl(dev, cmd, addr, flag, p)
746 	dev_t dev;
747 	u_long cmd;
748 	caddr_t addr;
749 	int flag;
750 	struct proc *p;
751 {
752 	int unit = vndunit(dev);
753 	struct vnd_softc *vnd;
754 	struct vnd_ioctl *vio;
755 	struct vattr vattr;
756 	struct nameidata nd;
757 	int error, part, pmask, s;
758 
759 #ifdef DEBUG
760 	if (vnddebug & VDB_FOLLOW)
761 		printf("vndioctl(%x, %lx, %p, %x, %p): unit %d\n",
762 		    dev, cmd, addr, flag, p, unit);
763 #endif
764 	error = suser(p, 0);
765 	if (error)
766 		return (error);
767 	if (unit >= numvnd)
768 		return (ENXIO);
769 
770 	vnd = &vnd_softc[unit];
771 	vio = (struct vnd_ioctl *)addr;
772 	switch (cmd) {
773 
774 	case VNDIOCSET:
775 		if (vnd->sc_flags & VNF_INITED)
776 			return (EBUSY);
777 		if (!(vnd->sc_flags & VNF_SIMPLE) && vio->vnd_keylen)
778 			return (EINVAL);
779 
780 		if ((error = vndlock(vnd)) != 0)
781 			return (error);
782 
783 		bzero(vnd->sc_dev.dv_xname, sizeof(vnd->sc_dev.dv_xname));
784 		if (snprintf(vnd->sc_dev.dv_xname, sizeof(vnd->sc_dev.dv_xname),
785 		    "vnd%d", unit) >= sizeof(vnd->sc_dev.dv_xname)) {
786 			printf("VNDIOCSET: device name too long\n");
787 			vndunlock(vnd);
788 			return(ENXIO);
789 		}
790 
791 		/*
792 		 * Always open for read and write.
793 		 * This is probably bogus, but it lets vn_open()
794 		 * weed out directories, sockets, etc. so we don't
795 		 * have to worry about them.
796 		 */
797 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vnd_file, p);
798 		if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
799 			vndunlock(vnd);
800 			return (error);
801 		}
802 		error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p);
803 		if (error) {
804 			VOP_UNLOCK(nd.ni_vp, 0, p);
805 			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
806 			vndunlock(vnd);
807 			return (error);
808 		}
809 		VOP_UNLOCK(nd.ni_vp, 0, p);
810 		vnd->sc_vp = nd.ni_vp;
811 		vnd->sc_size = btodb(vattr.va_size);	/* note truncation */
812 		if ((error = vndsetcred(vnd, p->p_ucred)) != 0) {
813 			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
814 			vndunlock(vnd);
815 			return (error);
816 		}
817 
818 		if (vio->vnd_keylen) {
819 			char *key;
820 
821 			key = malloc(vio->vnd_keylen, M_TEMP, M_WAITOK);
822 			if ((error = copyin((caddr_t)vio->vnd_key, key,
823 					    vio->vnd_keylen)) != 0) {
824 				(void) vn_close(nd.ni_vp, FREAD|FWRITE,
825 						p->p_ucred, p);
826 				vndunlock(vnd);
827 				return (error);
828 			}
829 
830 			vnd->sc_keyctx = malloc(sizeof(blf_ctx), M_DEVBUF,
831 						M_WAITOK);
832 			blf_key(vnd->sc_keyctx, key, vio->vnd_keylen);
833 			bzero(key, vio->vnd_keylen);
834 			free((caddr_t)key, M_TEMP);
835 		} else
836 			vnd->sc_keyctx = NULL;
837 
838 		vndthrottle(vnd, vnd->sc_vp);
839 		vio->vnd_size = dbtob((off_t)vnd->sc_size);
840 		vnd->sc_flags |= VNF_INITED;
841 #ifdef DEBUG
842 		if (vnddebug & VDB_INIT)
843 			printf("vndioctl: SET vp %p size %x\n",
844 			    vnd->sc_vp, vnd->sc_size);
845 #endif
846 
847 		/* Attach the disk. */
848 		vnd->sc_dk.dk_driver = &vnddkdriver;
849 		vnd->sc_dk.dk_name = vnd->sc_dev.dv_xname;
850 		disk_attach(&vnd->sc_dk);
851 		dk_establish(&vnd->sc_dk, &vnd->sc_dev);
852 
853 		vndunlock(vnd);
854 
855 		break;
856 
857 	case VNDIOCCLR:
858 		if ((vnd->sc_flags & VNF_INITED) == 0)
859 			return (ENXIO);
860 
861 		if ((error = vndlock(vnd)) != 0)
862 			return (error);
863 
864 		/*
865 		 * Don't unconfigure if any other partitions are open
866 		 * or if both the character and block flavors of this
867 		 * partition are open.
868 		 */
869 		part = DISKPART(dev);
870 		pmask = (1 << part);
871 		if ((vnd->sc_dk.dk_openmask & ~pmask) ||
872 		    ((vnd->sc_dk.dk_bopenmask & pmask) &&
873 		    (vnd->sc_dk.dk_copenmask & pmask))) {
874 			vndunlock(vnd);
875 			return (EBUSY);
876 		}
877 
878 		vndclear(vnd);
879 #ifdef DEBUG
880 		if (vnddebug & VDB_INIT)
881 			printf("vndioctl: CLRed\n");
882 #endif
883 		/* Free crypto key */
884 		if (vnd->sc_keyctx) {
885 			bzero(vnd->sc_keyctx, vio->vnd_keylen);
886 			free((caddr_t)vnd->sc_keyctx, M_DEVBUF);
887 		}
888 
889 		/* Detatch the disk. */
890 		disk_detach(&vnd->sc_dk);
891 
892 		/* This must be atomic. */
893 		s = splhigh();
894 		vndunlock(vnd);
895 		bzero(vnd, sizeof(struct vnd_softc));
896 		splx(s);
897 		break;
898 
899 	case DIOCGDINFO:
900 		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
901 			return (ENOTTY);
902 		*(struct disklabel *)addr = *(vnd->sc_dk.dk_label);
903 		return (0);
904 
905 	case DIOCGPART:
906 		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
907 			return (ENOTTY);
908 		((struct partinfo *)addr)->disklab = vnd->sc_dk.dk_label;
909 		((struct partinfo *)addr)->part =
910 		    &vnd->sc_dk.dk_label->d_partitions[DISKPART(dev)];
911 		return (0);
912 
913 	case DIOCWDINFO:
914 	case DIOCSDINFO:
915 		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
916 			return (ENOTTY);
917 		if ((flag & FWRITE) == 0)
918 			return (EBADF);
919 
920 		if ((error = vndlock(vnd)) != 0)
921 			return (error);
922 		vnd->sc_flags |= VNF_LABELLING;
923 
924 		error = setdisklabel(vnd->sc_dk.dk_label,
925 		    (struct disklabel *)addr, /*vnd->sc_dk.dk_openmask : */0,
926 		    vnd->sc_dk.dk_cpulabel);
927 		if (error == 0) {
928 			if (cmd == DIOCWDINFO)
929 				error = writedisklabel(MAKEDISKDEV(major(dev),
930 				    DISKUNIT(dev), RAW_PART),
931 				    vndstrategy, vnd->sc_dk.dk_label,
932 				    vnd->sc_dk.dk_cpulabel);
933 		}
934 
935 		vnd->sc_flags &= ~VNF_LABELLING;
936 		vndunlock(vnd);
937 		return (error);
938 
939 	case DIOCWLABEL:
940 		if ((flag & FWRITE) == 0)
941 			return (EBADF);
942 		if (*(int *)addr)
943 			vnd->sc_flags |= VNF_WLABEL;
944 		else
945 			vnd->sc_flags &= ~VNF_WLABEL;
946 		return (0);
947 
948 	default:
949 		return (ENOTTY);
950 	}
951 
952 	return (0);
953 }
954 
955 /*
956  * Duplicate the current processes' credentials.  Since we are called only
957  * as the result of a SET ioctl and only root can do that, any future access
958  * to this "disk" is essentially as root.  Note that credentials may change
959  * if some other uid can write directly to the mapped file (NFS).
960  */
961 int
962 vndsetcred(vnd, cred)
963 	struct vnd_softc *vnd;
964 	struct ucred *cred;
965 {
966 	struct uio auio;
967 	struct iovec aiov;
968 	char *tmpbuf;
969 	int error;
970 	struct proc *p = curproc;
971 
972 	vnd->sc_cred = crdup(cred);
973 	tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
974 
975 	/* XXX: Horrible kludge to establish credentials for NFS */
976 	aiov.iov_base = tmpbuf;
977 	aiov.iov_len = min(DEV_BSIZE, dbtob(vnd->sc_size));
978 	auio.uio_iov = &aiov;
979 	auio.uio_iovcnt = 1;
980 	auio.uio_offset = 0;
981 	auio.uio_rw = UIO_READ;
982 	auio.uio_segflg = UIO_SYSSPACE;
983 	auio.uio_resid = aiov.iov_len;
984 	vn_lock(vnd->sc_vp, LK_RETRY | LK_EXCLUSIVE, p);
985 	error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred);
986 	VOP_UNLOCK(vnd->sc_vp, 0, p);
987 
988 	free(tmpbuf, M_TEMP);
989 	return (error);
990 }
991 
992 /*
993  * Set maxactive based on FS type
994  */
void
vndthrottle(vnd, vp)
	struct vnd_softc *vnd;
	struct vnode *vp;
{
#ifdef NFSCLIENT
	extern int (**nfsv2_vnodeop_p)(void *);

	/* Keep concurrency low over NFS so we don't flood the server. */
	if (vp->v_op == nfsv2_vnodeop_p)
		vnd->sc_maxactive = 2;
	else
#endif
		vnd->sc_maxactive = 8;
}
1009 
1010 void
1011 vndshutdown()
1012 {
1013 	struct vnd_softc *vnd;
1014 
1015 	for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++)
1016 		if (vnd->sc_flags & VNF_INITED)
1017 			vndclear(vnd);
1018 }
1019 
1020 void
1021 vndclear(vnd)
1022 	struct vnd_softc *vnd;
1023 {
1024 	struct vnode *vp = vnd->sc_vp;
1025 	struct proc *p = curproc;		/* XXX */
1026 
1027 #ifdef DEBUG
1028 	if (vnddebug & VDB_FOLLOW)
1029 		printf("vndclear(%p): vp %p\n", vnd, vp);
1030 #endif
1031 	vnd->sc_flags &= ~VNF_INITED;
1032 	if (vp == (struct vnode *)0)
1033 		panic("vndioctl: null vp");
1034 	(void) vn_close(vp, FREAD|FWRITE, vnd->sc_cred, p);
1035 	crfree(vnd->sc_cred);
1036 	vnd->sc_vp = (struct vnode *)0;
1037 	vnd->sc_cred = (struct ucred *)0;
1038 	vnd->sc_size = 0;
1039 }
1040 
1041 int
1042 vndsize(dev)
1043 	dev_t dev;
1044 {
1045 	int unit = vndunit(dev);
1046 	struct vnd_softc *vnd = &vnd_softc[unit];
1047 
1048 	if (unit >= numvnd || (vnd->sc_flags & VNF_INITED) == 0)
1049 		return (-1);
1050 	return (vnd->sc_size);
1051 }
1052 
1053 int
1054 vnddump(dev, blkno, va, size)
1055 	dev_t dev;
1056 	daddr_t blkno;
1057 	caddr_t va;
1058 	size_t size;
1059 {
1060 
1061 	/* Not implemented. */
1062 	return (ENXIO);
1063 }
1064 
1065 /*
1066  * Wait interruptibly for an exclusive lock.
1067  *
1068  * XXX
1069  * Several drivers do this; it should be abstracted and made MP-safe.
1070  */
1071 int
1072 vndlock(sc)
1073 	struct vnd_softc *sc;
1074 {
1075 	int error;
1076 
1077 	while ((sc->sc_flags & VNF_LOCKED) != 0) {
1078 		sc->sc_flags |= VNF_WANTED;
1079 		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
1080 			return (error);
1081 	}
1082 	sc->sc_flags |= VNF_LOCKED;
1083 	return (0);
1084 }
1085 
1086 /*
1087  * Unlock and wake up any waiters.
1088  */
1089 void
1090 vndunlock(sc)
1091 	struct vnd_softc *sc;
1092 {
1093 
1094 	sc->sc_flags &= ~VNF_LOCKED;
1095 	if ((sc->sc_flags & VNF_WANTED) != 0) {
1096 		sc->sc_flags &= ~VNF_WANTED;
1097 		wakeup(sc);
1098 	}
1099 }
1100