/*	$OpenBSD: vnd.c,v 1.38 2003/06/02 23:28:01 millert Exp $	*/
/*	$NetBSD: vnd.c,v 1.26 1996/03/30 23:06:11 christos Exp $	*/

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vn.c 1.13 94/04/02$
 *
 *	@(#)vn.c	8.6 (Berkeley) 4/1/94
 */

/*
 * Vnode disk driver.
 *
 * Block/character interface to a vnode.  Allows one to treat a file
 * as a disk (e.g. build a filesystem in it, mount it, etc.).
 *
 * NOTE 1: This uses either the VOP_BMAP/VOP_STRATEGY interface to the
 * vnode or simple VOP_READ/VOP_WRITE.  The former is suitable for swapping
 * as it doesn't distort the local buffer cache.  The latter is good for
 * building disk images as it keeps the cache consistent after the block
 * device is closed.
 *
 * NOTE 2: There is a security issue involved with this driver.
 * Once mounted, all access to the contents of the "mapped" file via
 * the special file is controlled by the permissions on the special
 * file; the protection of the mapped file is ignored (effectively,
 * by using root credentials in all transactions).
 *
 * NOTE 3: Doesn't interact with leases; should it?
 */
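
/*
 * Usage sketch (illustrative only, not part of the driver): a unit is
 * normally configured from userland with vnconfig(8), which issues the
 * VNDIOCSET ioctl handled below.  Roughly, assuming unit 0 and an image
 * file name picked for the example:
 *
 *	struct vnd_ioctl vio;
 *	int fd;
 *
 *	fd = open("/dev/rvnd0c", O_RDWR);
 *	bzero(&vio, sizeof(vio));
 *	vio.vnd_file = "/tmp/disk.img";
 *	if (ioctl(fd, VNDIOCSET, &vio) == -1)
 *		err(1, "VNDIOCSET");
 *
 * On success vio.vnd_size holds the configured size in bytes; VNDIOCCLR
 * on the same descriptor unconfigures the unit again.  The ioctl needs
 * root (see the suser() check in vndioctl()), and "/dev/rvnd0c" assumes
 * the raw partition is 'c' on this architecture.
 */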

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/ioctl.h>
#include <sys/disklabel.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>

#include <crypto/blf.h>

#include <miscfs/specfs/specdev.h>

#include <dev/vndioctl.h>

#ifdef DEBUG
int dovndcluster = 1;
int vnddebug = 0x00;
#define VDB_FOLLOW	0x01
#define VDB_INIT	0x02
#define VDB_IO		0x04
#endif

#define b_cylin	b_resid

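/*
 * The device minor encodes both the unit and the access method: bit
 * 0x80 selects the "simple" VOP_READ/VOP_WRITE interface (the svnd
 * device nodes), while the remaining bits are decoded with the usual
 * DISKUNIT/DISKPART macros.
 */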
#define	vndunit(x)	DISKUNIT((x) & 0x7f)
#define vndsimple(x)	((x) & 0x80)
#define	MAKEVNDDEV(maj, unit, part)	MAKEDISKDEV(maj, unit, part)

#define	VNDLABELDEV(dev) (MAKEVNDDEV(major(dev), vndunit(dev), RAW_PART))

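/*
 * Component buffer for the VOP_BMAP/VOP_STRATEGY method: each piece of
 * a request is issued as its own struct buf, with vb_obp pointing back
 * at the original buffer so vndiodone() can account for the pieces and
 * complete the original request.
 */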
struct vndbuf {
	struct buf	vb_buf;
	struct buf	*vb_obp;
};

#define	getvndbuf()	\
	((struct vndbuf *)malloc(sizeof(struct vndbuf), M_DEVBUF, M_WAITOK))
#define putvndbuf(vbp)	\
	free((caddr_t)(vbp), M_DEVBUF)

struct vnd_softc {
	struct device	 sc_dev;
	struct disk	 sc_dk;

	int		 sc_flags;	/* flags */
	size_t		 sc_size;	/* size of vnd in blocks */
	struct vnode	*sc_vp;		/* vnode */
	struct ucred	*sc_cred;	/* credentials */
	int		 sc_maxactive;	/* max # of active requests */
	struct buf	 sc_tab;	/* transfer queue */
	void		*sc_keyctx;	/* key context */
};

/* sc_flags */
#define	VNF_ALIVE	0x0001	/* unit exists */
#define VNF_INITED	0x0002	/* unit has been configured */
#define VNF_WANTED	0x0040	/* someone is waiting on the lock */
#define VNF_LOCKED	0x0080	/* unit locked for configuration */
#define	VNF_LABELLING	0x0100	/* unit is being labelled */
#define	VNF_WLABEL	0x0200	/* label area is writable */
#define	VNF_HAVELABEL	0x0400	/* unit has a label */
#define VNF_BUSY	0x0800	/* I/O in progress (simple method) */
#define VNF_SIMPLE	0x1000	/* unit uses VOP_READ/VOP_WRITE */

struct vnd_softc *vnd_softc;
int numvnd = 0;

struct dkdriver vnddkdriver = { vndstrategy };

/* called by main() at boot time */
void	vndattach(int);

void	vndclear(struct vnd_softc *);
void	vndstart(struct vnd_softc *);
int	vndsetcred(struct vnd_softc *, struct ucred *);
void	vndthrottle(struct vnd_softc *, struct vnode *);
void	vndiodone(struct buf *);
void	vndshutdown(void);
void	vndgetdisklabel(dev_t, struct vnd_softc *);
void	vndencrypt(struct vnd_softc *, caddr_t, size_t, daddr_t, int);

int	vndlock(struct vnd_softc *);
void	vndunlock(struct vnd_softc *);

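/*
 * Encrypt or decrypt (according to the "encrypt" argument) "size" bytes
 * at "addr" in place, one DEV_BSIZE block at a time.  Each block gets
 * an IV derived from its block number "off": the 64-bit IV buffer is
 * seeded with the block number and run through Blowfish in ECB mode,
 * then the block itself is processed in CBC mode with that IV.
 */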
void
vndencrypt(vnd, addr, size, off, encrypt)
	struct vnd_softc *vnd;
	caddr_t addr;
	size_t size;
	daddr_t off;
	int encrypt;
{
	int i, bsize;
	u_char iv[8];

	bsize = dbtob(1);
	for (i = 0; i < size/bsize; i++) {
		bzero(iv, sizeof(iv));
		bcopy((u_char *)&off, iv, sizeof(off));
		blf_ecb_encrypt(vnd->sc_keyctx, iv, sizeof(iv));
		if (encrypt)
			blf_cbc_encrypt(vnd->sc_keyctx, iv, addr, bsize);
		else
			blf_cbc_decrypt(vnd->sc_keyctx, iv, addr, bsize);

		addr += bsize;
		off++;
	}
}

void
vndattach(num)
	int num;
{
	char *mem;
	u_long size;

	if (num <= 0)
		return;
	size = num * sizeof(struct vnd_softc);
	mem = malloc(size, M_DEVBUF, M_NOWAIT);
	if (mem == NULL) {
		printf("WARNING: no memory for vnode disks\n");
		return;
	}
	bzero(mem, size);
	vnd_softc = (struct vnd_softc *)mem;
	numvnd = num;
}

int
vndopen(dev, flags, mode, p)
	dev_t dev;
	int flags, mode;
	struct proc *p;
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;
	int error = 0, part, pmask;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndopen(%x, %x, %x, %p)\n", dev, flags, mode, p);
#endif
	if (unit >= numvnd)
		return (ENXIO);
	sc = &vnd_softc[unit];

	if ((error = vndlock(sc)) != 0)
		return (error);

	if ((sc->sc_flags & VNF_INITED) &&
	    (sc->sc_flags & VNF_HAVELABEL) == 0) {
		sc->sc_flags |= VNF_HAVELABEL;
		vndgetdisklabel(dev, sc);
	}

	part = DISKPART(dev);
	pmask = 1 << part;

	/*
	 * If any partition is open, all succeeding openings must be of the
	 * same type.
	 */
	if (sc->sc_dk.dk_openmask) {
		if (((sc->sc_flags & VNF_SIMPLE) != 0) !=
		    (vndsimple(dev) != 0)) {
			error = EBUSY;
			goto bad;
		}
	} else if (vndsimple(dev))
		sc->sc_flags |= VNF_SIMPLE;
	else
		sc->sc_flags &= ~VNF_SIMPLE;

	/* Check that the partition exists. */
	if (part != RAW_PART &&
	    ((sc->sc_flags & VNF_HAVELABEL) == 0 ||
	    part >= sc->sc_dk.dk_label->d_npartitions ||
	    sc->sc_dk.dk_label->d_partitions[part].p_fstype == FS_UNUSED)) {
		error = ENXIO;
		goto bad;
	}

	/* Prevent our unit from being unconfigured while open. */
	switch (mode) {
	case S_IFCHR:
		sc->sc_dk.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		sc->sc_dk.dk_bopenmask |= pmask;
		break;
	}
	sc->sc_dk.dk_openmask =
	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;

	vndunlock(sc);
	return (0);
bad:
	vndunlock(sc);
	return (error);
}

/*
 * Load the label information for the named device.  A default label
 * covering the whole file is fabricated first; readdisklabel() then
 * overlays any real label found on the "disk".
 */
void
vndgetdisklabel(dev, sc)
	dev_t dev;
	struct vnd_softc *sc;
{
	struct disklabel *lp = sc->sc_dk.dk_label;
	char *errstring;

	bzero(lp, sizeof(struct disklabel));
	bzero(sc->sc_dk.dk_cpulabel, sizeof(struct cpu_disklabel));

	lp->d_secsize = 512;
	lp->d_ntracks = 1;
	lp->d_nsectors = 100;
	lp->d_ncylinders = sc->sc_size / 100;
	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
	if (lp->d_secpercyl == 0) {
		lp->d_secpercyl = 100;
		/* as long as it's not 0 - readdisklabel divides by it (?) */
	}

	strncpy(lp->d_typename, "vnd device", sizeof(lp->d_typename));
	lp->d_type = DTYPE_SCSI;
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
	lp->d_secperunit = sc->sc_size;
	lp->d_rpm = 3600;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	lp->d_partitions[RAW_PART].p_offset = 0;
	lp->d_partitions[RAW_PART].p_size =
	    lp->d_secperunit * (lp->d_secsize / DEV_BSIZE);
	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
	lp->d_npartitions = RAW_PART + 1;

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = dkcksum(lp);

	/*
	 * Call the generic disklabel extraction routine
	 */
	errstring = readdisklabel(VNDLABELDEV(dev), vndstrategy, lp,
	    sc->sc_dk.dk_cpulabel, 0);
	if (errstring) {
		/*printf("%s: %s\n", sc->sc_dev.dv_xname, errstring);*/
		return;
	}
}

int
vndclose(dev, flags, mode, p)
	dev_t dev;
	int flags, mode;
	struct proc *p;
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;
	int error = 0, part;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndclose(%x, %x, %x, %p)\n", dev, flags, mode, p);
#endif

	if (unit >= numvnd)
		return (ENXIO);
	sc = &vnd_softc[unit];

	if ((error = vndlock(sc)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (mode) {
	case S_IFCHR:
		sc->sc_dk.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		sc->sc_dk.dk_bopenmask &= ~(1 << part);
		break;
	}
	sc->sc_dk.dk_openmask =
	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;

	vndunlock(sc);
	return (0);
}

/*
 * Two methods are used: the traditional one that bypasses the buffer
 * cache, and the newer one that stays cache-coherent across unmount.
 *
 * Former method:
 * Break the request into bsize pieces and submit using VOP_BMAP/VOP_STRATEGY.
 * Note that this driver can only be used for swapping over NFS on the hp
 * since nfs_strategy on the vax cannot handle u-areas and page tables.
 *
 * Latter method:
 * Repack the buffer into an uio structure and use VOP_READ/VOP_WRITE to
 * access the underlying file.  Things are complicated by the fact that we
 * might get recursively called due to buffer flushes.  In those cases we
 * queue one write.
 */
void
vndstrategy(bp)
	struct buf *bp;
{
	int unit = vndunit(bp->b_dev);
	struct vnd_softc *vnd = &vnd_softc[unit];
	struct vndbuf *nbp;
	int bsize;
	off_t bn;
	caddr_t addr;
	size_t resid;
	int sz, flags, error, s;
	struct iovec aiov;
	struct uio auio;
	struct proc *p = curproc;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndstrategy(%p): unit %d\n", bp, unit);
#endif
	if ((vnd->sc_flags & VNF_INITED) == 0) {
		bp->b_error = ENXIO;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	bn = bp->b_blkno;
	sz = howmany(bp->b_bcount, DEV_BSIZE);
	bp->b_resid = bp->b_bcount;
	if (bn < 0) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}
	if (DISKPART(bp->b_dev) != RAW_PART &&
	    bounds_check_with_label(bp, vnd->sc_dk.dk_label,
	    vnd->sc_dk.dk_cpulabel, 1) <= 0) {
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* No bypassing of buffer cache?  */
	if (vndsimple(bp->b_dev)) {
		/*
		 * In order to avoid "locking against myself" panics, we
		 * must be prepared to queue operations during another I/O
		 * operation.  This situation comes up where a dirty cache
		 * buffer needs to be flushed in order to provide the current
		 * operation with a fresh buffer.
		 *
		 * XXX do we really need to protect stuff relating to this with
		 * splbio?
		 */
		if (vnd->sc_flags & VNF_BUSY) {
			s = splbio();
			bp->b_actf = vnd->sc_tab.b_actf;
			vnd->sc_tab.b_actf = bp;
			vnd->sc_tab.b_active++;
			splx(s);
			return;
		}

		/* Loop until all queued requests are handled.  */
		for (;;) {
			int part = DISKPART(bp->b_dev);
			int off = vnd->sc_dk.dk_label->d_partitions[part].p_offset;

			aiov.iov_base = bp->b_data;
			auio.uio_resid = aiov.iov_len = bp->b_bcount;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = dbtob((off_t)(bp->b_blkno + off));
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_procp = NULL;

			vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
			vnd->sc_flags |= VNF_BUSY;
			if (bp->b_flags & B_READ) {
				auio.uio_rw = UIO_READ;
				bp->b_error = VOP_READ(vnd->sc_vp, &auio, 0,
				    vnd->sc_cred);
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
					    bp->b_bcount, bp->b_blkno, 0);
			} else {
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
					    bp->b_bcount, bp->b_blkno, 1);
				auio.uio_rw = UIO_WRITE;
				bp->b_error = VOP_WRITE(vnd->sc_vp, &auio, 0,
				    vnd->sc_cred);
				/* Data in buffer cache needs to be in clear */
				if (vnd->sc_keyctx)
					vndencrypt(vnd, bp->b_data,
					    bp->b_bcount, bp->b_blkno, 0);
			}
			vnd->sc_flags &= ~VNF_BUSY;
			VOP_UNLOCK(vnd->sc_vp, 0, p);
			if (bp->b_error)
				bp->b_flags |= B_ERROR;
			bp->b_resid = auio.uio_resid;
			s = splbio();
			biodone(bp);
			splx(s);

			/* If nothing more is queued, we are done.  */
			if (!vnd->sc_tab.b_active)
				return;

			/*
			 * Dequeue now since lower level strategy
			 * routine might queue using same links.
			 */
			s = splbio();
			bp = vnd->sc_tab.b_actf;
			vnd->sc_tab.b_actf = bp->b_actf;
			vnd->sc_tab.b_active--;
			splx(s);
		}
	}

	/* The old-style, buffer-cache-bypassing method.  */
	bn += vnd->sc_dk.dk_label->d_partitions[DISKPART(bp->b_dev)].p_offset;
	bn = dbtob(bn);
	bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize;
	addr = bp->b_data;
	flags = bp->b_flags | B_CALL;
	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vnode *vp;
		daddr_t nbn;
		int off, s, nra;

		nra = 0;
		vn_lock(vnd->sc_vp, LK_RETRY | LK_EXCLUSIVE, p);
		error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra);
		VOP_UNLOCK(vnd->sc_vp, 0, p);
		if (error == 0 && (long)nbn == -1)
			error = EIO;
#ifdef DEBUG
		if (!dovndcluster)
			nra = 0;
#endif

		if ((off = bn % bsize) != 0)
			sz = bsize - off;
		else
			sz = (1 + nra) * bsize;
		if (resid < sz)
			sz = resid;
#ifdef DEBUG
		if (vnddebug & VDB_IO)
			printf("vndstrategy: vp %p/%p bn %x/%x sz %x\n",
			    vnd->sc_vp, vp, bn, nbn, sz);
#endif

		nbp = getvndbuf();
		nbp->vb_buf.b_flags = flags;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = bp->b_bufsize;
		nbp->vb_buf.b_error = 0;
		if (vp->v_type == VBLK || vp->v_type == VCHR)
			nbp->vb_buf.b_dev = vp->v_rdev;
		else
			nbp->vb_buf.b_dev = NODEV;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_proc = bp->b_proc;
		nbp->vb_buf.b_iodone = vndiodone;
		nbp->vb_buf.b_vp = vp;
		nbp->vb_buf.b_dirtyoff = bp->b_dirtyoff;
		nbp->vb_buf.b_dirtyend = bp->b_dirtyend;
		nbp->vb_buf.b_validoff = bp->b_validoff;
		nbp->vb_buf.b_validend = bp->b_validend;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/* save a reference to the old buffer */
		nbp->vb_obp = bp;

		/*
		 * If there was an error or a hole in the file...punt.
		 * Note that we deal with this after the nbp allocation.
		 * This ensures that we properly clean up any operations
		 * that we have already fired off.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			nbp->vb_buf.b_error = error;
			nbp->vb_buf.b_flags |= B_ERROR;
			bp->b_resid -= (resid - sz);
			s = splbio();
			biodone(&nbp->vb_buf);
			splx(s);
			return;
		}
		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylin = nbp->vb_buf.b_blkno;
		s = splbio();
		disksort(&vnd->sc_tab, &nbp->vb_buf);
		if (vnd->sc_tab.b_active < vnd->sc_maxactive) {
			vnd->sc_tab.b_active++;
			vndstart(vnd);
		}
		splx(s);

		bn += sz;
		addr += sz;
	}
}

/*
 * Feed requests sequentially.
 * We do it this way to keep from flooding NFS servers if we are connected
 * to an NFS file.  This places the burden on the client rather than the
 * server.
 */
void
vndstart(vnd)
	struct vnd_softc *vnd;
{
	struct buf *bp;

	/*
	 * Dequeue now since lower level strategy routine might
	 * queue using same links
	 */
	bp = vnd->sc_tab.b_actf;
	vnd->sc_tab.b_actf = bp->b_actf;
#ifdef DEBUG
	if (vnddebug & VDB_IO)
		printf("vndstart(%d): bp %p vp %p blkno %x addr %p cnt %lx\n",
		    vnd-vnd_softc, bp, bp->b_vp, bp->b_blkno, bp->b_data,
		    bp->b_bcount);
#endif

	/* Instrumentation. */
	disk_busy(&vnd->sc_dk);

	if ((bp->b_flags & B_READ) == 0)
		bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);
}

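/*
 * B_CALL completion handler for the component buffers issued by the
 * VOP_BMAP/VOP_STRATEGY method.  Propagates any error to the original
 * request, frees the component buffer, completes the original request
 * once all pieces have been accounted for, and keeps the transfer
 * queue moving.  Called at splbio from biodone().
 */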
void
vndiodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct buf *pbp = vbp->vb_obp;
	struct vnd_softc *vnd = &vnd_softc[vndunit(pbp->b_dev)];
	long count;

	splassert(IPL_BIO);

#ifdef DEBUG
	if (vnddebug & VDB_IO)
		printf("vndiodone(%d): vbp %p vp %p blkno %x addr %p cnt %lx\n",
		    vnd-vnd_softc, vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
		    vbp->vb_buf.b_data, vbp->vb_buf.b_bcount);
#endif

	if (vbp->vb_buf.b_error) {
#ifdef DEBUG
		if (vnddebug & VDB_IO)
			printf("vndiodone: vbp %p error %d\n", vbp,
			    vbp->vb_buf.b_error);
#endif
		pbp->b_flags |= B_ERROR;
		pbp->b_error = biowait(&vbp->vb_buf);
	}
	pbp->b_resid -= vbp->vb_buf.b_bcount;
	putvndbuf(vbp);
	count = pbp->b_bcount - pbp->b_resid;
	if (pbp->b_resid == 0) {
#ifdef DEBUG
		if (vnddebug & VDB_IO)
			printf("vndiodone: pbp %p iodone\n", pbp);
#endif
		biodone(pbp);
	}
	if (vnd->sc_tab.b_active) {
		disk_unbusy(&vnd->sc_dk, count);
		if (vnd->sc_tab.b_actf)
			vndstart(vnd);
		else
			vnd->sc_tab.b_active--;
	}
}

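/*
 * Raw character device I/O: hand the uio to physio(), which builds
 * buffers and pushes them through vndstrategy().
 */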
/* ARGSUSED */
int
vndread(dev, uio, flags)
	dev_t dev;
	struct uio *uio;
	int flags;
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndread(%x, %p)\n", dev, uio);
#endif

	if (unit >= numvnd)
		return (ENXIO);
	sc = &vnd_softc[unit];

	if ((sc->sc_flags & VNF_INITED) == 0)
		return (ENXIO);

	return (physio(vndstrategy, NULL, dev, B_READ, minphys, uio));
}

/* ARGSUSED */
int
vndwrite(dev, uio, flags)
	dev_t dev;
	struct uio *uio;
	int flags;
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndwrite(%x, %p)\n", dev, uio);
#endif

	if (unit >= numvnd)
		return (ENXIO);
	sc = &vnd_softc[unit];

	if ((sc->sc_flags & VNF_INITED) == 0)
		return (ENXIO);

	return (physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio));
}

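/*
 * Ioctl entry point: VNDIOCSET configures a unit on top of a backing
 * file (optionally with a Blowfish key for transparent encryption),
 * VNDIOCCLR tears it down again, and the DIOC* ioctls operate on the
 * disklabel.  All of them require superuser credentials.
 */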
/* ARGSUSED */
int
vndioctl(dev, cmd, addr, flag, p)
	dev_t dev;
	u_long cmd;
	caddr_t addr;
	int flag;
	struct proc *p;
{
	int unit = vndunit(dev);
	struct vnd_softc *vnd;
	struct vnd_ioctl *vio;
	struct vattr vattr;
	struct nameidata nd;
	int error, part, pmask, s;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndioctl(%x, %lx, %p, %x, %p): unit %d\n",
		    dev, cmd, addr, flag, p, unit);
#endif
	error = suser(p->p_ucred, &p->p_acflag);
	if (error)
		return (error);
	if (unit >= numvnd)
		return (ENXIO);

	vnd = &vnd_softc[unit];
	vio = (struct vnd_ioctl *)addr;
	switch (cmd) {

	case VNDIOCSET:
		if (vnd->sc_flags & VNF_INITED)
			return (EBUSY);
		if (!(vnd->sc_flags & VNF_SIMPLE) && vio->vnd_keylen)
			return (EINVAL);

		if ((error = vndlock(vnd)) != 0)
			return (error);

		bzero(vnd->sc_dev.dv_xname, sizeof(vnd->sc_dev.dv_xname));
		if (snprintf(vnd->sc_dev.dv_xname, sizeof(vnd->sc_dev.dv_xname),
		    "vnd%d", unit) >= sizeof(vnd->sc_dev.dv_xname)) {
			printf("VNDIOCSET: device name too long\n");
			vndunlock(vnd);
			return (ENXIO);
		}

		/*
		 * Always open for read and write.
		 * This is probably bogus, but it lets vn_open()
		 * weed out directories, sockets, etc. so we don't
		 * have to worry about them.
		 */
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vnd_file, p);
		if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
			vndunlock(vnd);
			return (error);
		}
		error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p);
		if (error) {
			VOP_UNLOCK(nd.ni_vp, 0, p);
			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
			vndunlock(vnd);
			return (error);
		}
		VOP_UNLOCK(nd.ni_vp, 0, p);
		vnd->sc_vp = nd.ni_vp;
		vnd->sc_size = btodb(vattr.va_size);	/* note truncation */
		if ((error = vndsetcred(vnd, p->p_ucred)) != 0) {
			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
			vndunlock(vnd);
			return (error);
		}

		if (vio->vnd_keylen) {
			char *key;

			key = malloc(vio->vnd_keylen, M_TEMP, M_WAITOK);
			if ((error = copyin((caddr_t)vio->vnd_key, key,
			    vio->vnd_keylen)) != 0) {
				(void) vn_close(nd.ni_vp, FREAD|FWRITE,
				    p->p_ucred, p);
				vndunlock(vnd);
				return (error);
			}

			vnd->sc_keyctx = malloc(sizeof(blf_ctx), M_DEVBUF,
			    M_WAITOK);
			blf_key(vnd->sc_keyctx, key, vio->vnd_keylen);
			bzero(key, vio->vnd_keylen);
			free((caddr_t)key, M_TEMP);
		} else
			vnd->sc_keyctx = NULL;

		vndthrottle(vnd, vnd->sc_vp);
		vio->vnd_size = dbtob((off_t)vnd->sc_size);
		vnd->sc_flags |= VNF_INITED;
#ifdef DEBUG
		if (vnddebug & VDB_INIT)
			printf("vndioctl: SET vp %p size %x\n",
			    vnd->sc_vp, vnd->sc_size);
#endif

		/* Attach the disk. */
		vnd->sc_dk.dk_driver = &vnddkdriver;
		vnd->sc_dk.dk_name = vnd->sc_dev.dv_xname;
		disk_attach(&vnd->sc_dk);
		dk_establish(&vnd->sc_dk, &vnd->sc_dev);

		vndunlock(vnd);

		break;

	case VNDIOCCLR:
		if ((vnd->sc_flags & VNF_INITED) == 0)
			return (ENXIO);

		if ((error = vndlock(vnd)) != 0)
			return (error);

		/*
		 * Don't unconfigure if any other partitions are open
		 * or if both the character and block flavors of this
		 * partition are open.
		 */
		part = DISKPART(dev);
		pmask = (1 << part);
		if ((vnd->sc_dk.dk_openmask & ~pmask) ||
		    ((vnd->sc_dk.dk_bopenmask & pmask) &&
		    (vnd->sc_dk.dk_copenmask & pmask))) {
			vndunlock(vnd);
			return (EBUSY);
		}

		vndclear(vnd);
#ifdef DEBUG
		if (vnddebug & VDB_INIT)
			printf("vndioctl: CLRed\n");
#endif
		/*
		 * Zero and free the crypto key context.  Clear the whole
		 * blf_ctx: vio->vnd_keylen is not meaningful for VNDIOCCLR.
		 */
		if (vnd->sc_keyctx) {
			bzero(vnd->sc_keyctx, sizeof(blf_ctx));
			free((caddr_t)vnd->sc_keyctx, M_DEVBUF);
		}

		/* Detach the disk. */
		disk_detach(&vnd->sc_dk);

		/* This must be atomic. */
		s = splhigh();
		vndunlock(vnd);
		bzero(vnd, sizeof(struct vnd_softc));
		splx(s);
		break;

	case DIOCGDINFO:
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		*(struct disklabel *)addr = *(vnd->sc_dk.dk_label);
		return (0);

	case DIOCGPART:
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		((struct partinfo *)addr)->disklab = vnd->sc_dk.dk_label;
		((struct partinfo *)addr)->part =
		    &vnd->sc_dk.dk_label->d_partitions[DISKPART(dev)];
		return (0);

	case DIOCWDINFO:
	case DIOCSDINFO:
		if ((vnd->sc_flags & VNF_HAVELABEL) == 0)
			return (ENOTTY);
		if ((flag & FWRITE) == 0)
			return (EBADF);

		if ((error = vndlock(vnd)) != 0)
			return (error);
		vnd->sc_flags |= VNF_LABELLING;

		error = setdisklabel(vnd->sc_dk.dk_label,
		    (struct disklabel *)addr, /*vnd->sc_dk.dk_openmask : */0,
		    vnd->sc_dk.dk_cpulabel);
		if (error == 0) {
			if (cmd == DIOCWDINFO)
				error = writedisklabel(MAKEDISKDEV(major(dev),
				    DISKUNIT(dev), RAW_PART),
				    vndstrategy, vnd->sc_dk.dk_label,
				    vnd->sc_dk.dk_cpulabel);
		}

		vnd->sc_flags &= ~VNF_LABELLING;
		vndunlock(vnd);
		return (error);

	case DIOCWLABEL:
		if ((flag & FWRITE) == 0)
			return (EBADF);
		if (*(int *)addr)
			vnd->sc_flags |= VNF_WLABEL;
		else
			vnd->sc_flags &= ~VNF_WLABEL;
		return (0);

	default:
		return (ENOTTY);
	}

	return (0);
}

/*
 * Duplicate the current process's credentials.  Since we are called only
 * as the result of a SET ioctl and only root can do that, any future access
 * to this "disk" is essentially as root.  Note that credentials may change
 * if some other uid can write directly to the mapped file (NFS).
 */
int
vndsetcred(vnd, cred)
	struct vnd_softc *vnd;
	struct ucred *cred;
{
	struct uio auio;
	struct iovec aiov;
	char *tmpbuf;
	int error;
	struct proc *p = curproc;

	vnd->sc_cred = crdup(cred);
	tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);

	/* XXX: Horrible kludge to establish credentials for NFS */
	aiov.iov_base = tmpbuf;
	aiov.iov_len = min(DEV_BSIZE, dbtob(vnd->sc_size));
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_resid = aiov.iov_len;
	vn_lock(vnd->sc_vp, LK_RETRY | LK_EXCLUSIVE, p);
	error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred);
	VOP_UNLOCK(vnd->sc_vp, 0, p);

	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Set maxactive based on FS type
 */
void
vndthrottle(vnd, vp)
	struct vnd_softc *vnd;
	struct vnode *vp;
{
#ifdef NFSCLIENT
	extern int (**nfsv2_vnodeop_p)(void *);

	if (vp->v_op == nfsv2_vnodeop_p)
		vnd->sc_maxactive = 2;
	else
#endif
		vnd->sc_maxactive = 8;
}

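/*
 * Unconfigure all active units; intended to be called at shutdown so
 * the backing vnodes are closed cleanly before the system halts.
 */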
void
vndshutdown()
{
	struct vnd_softc *vnd;

	for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++)
		if (vnd->sc_flags & VNF_INITED)
			vndclear(vnd);
}

void
vndclear(vnd)
	struct vnd_softc *vnd;
{
	struct vnode *vp = vnd->sc_vp;
	struct proc *p = curproc;		/* XXX */

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndclear(%p): vp %p\n", vnd, vp);
#endif
	vnd->sc_flags &= ~VNF_INITED;
	if (vp == (struct vnode *)0)
		panic("vndclear: null vp");
	(void) vn_close(vp, FREAD|FWRITE, vnd->sc_cred, p);
	crfree(vnd->sc_cred);
	vnd->sc_vp = (struct vnode *)0;
	vnd->sc_cred = (struct ucred *)0;
	vnd->sc_size = 0;
}

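/*
 * Return the size of the unit in DEV_BSIZE sectors, in the manner of
 * a d_psize entry point (used e.g. when swapping to a vnd).
 */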
int
vndsize(dev)
	dev_t dev;
{
	int unit = vndunit(dev);
	struct vnd_softc *vnd = &vnd_softc[unit];

	if (unit >= numvnd || (vnd->sc_flags & VNF_INITED) == 0)
		return (-1);
	return (vnd->sc_size);
}

int
vnddump(dev, blkno, va, size)
	dev_t dev;
	daddr_t blkno;
	caddr_t va;
	size_t size;
{

	/* Not implemented. */
	return (ENXIO);
}

/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 */
int
vndlock(sc)
	struct vnd_softc *sc;
{
	int error;

	while ((sc->sc_flags & VNF_LOCKED) != 0) {
		sc->sc_flags |= VNF_WANTED;
		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
			return (error);
	}
	sc->sc_flags |= VNF_LOCKED;
	return (0);
}

/*
 * Unlock and wake up any waiters.
 */
void
vndunlock(sc)
	struct vnd_softc *sc;
{

	sc->sc_flags &= ~VNF_LOCKED;
	if ((sc->sc_flags & VNF_WANTED) != 0) {
		sc->sc_flags &= ~VNF_WANTED;
		wakeup(sc);
	}
}
1096