1 /*	$NetBSD: vnd.c,v 1.73 2001/09/30 12:32:09 chs Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * Copyright (c) 1988 University of Utah.
41  * Copyright (c) 1990, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  *
44  * This code is derived from software contributed to Berkeley by
45  * the Systems Programming Group of the University of Utah Computer
46  * Science Department.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  * 3. All advertising materials mentioning features or use of this software
57  *    must display the following acknowledgement:
58  *	This product includes software developed by the University of
59  *	California, Berkeley and its contributors.
60  * 4. Neither the name of the University nor the names of its contributors
61  *    may be used to endorse or promote products derived from this software
62  *    without specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74  * SUCH DAMAGE.
75  *
76  * from: Utah $Hdr: vn.c 1.13 94/04/02$
77  *
78  *	@(#)vn.c	8.9 (Berkeley) 5/14/95
79  */
80 
81 /*
82  * Vnode disk driver.
83  *
84  * Block/character interface to a vnode.  Allows one to treat a file
85  * as a disk (e.g. build a filesystem in it, mount it, etc.).
86  *
87  * NOTE 1: This uses the VOP_BMAP/VOP_STRATEGY interface to the vnode
88  * instead of a simple VOP_RDWR.  We do this to avoid distorting the
89  * local buffer cache.
90  *
91  * NOTE 2: There is a security issue involved with this driver.
92  * Once mounted, all access to the contents of the "mapped" file via
93  * the special file is controlled by the permissions on the special
94  * file; the protection of the mapped file is ignored (effectively,
95  * by using root credentials in all transactions).
96  *
97  * NOTE 3: Doesn't interact with leases, should it?
98  */
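
/*
 * Illustrative usage sketch (not part of the driver; exact commands,
 * flags and paths are examples only).  A unit is normally configured
 * from userland with vnconfig(8), which issues the VNDIOCSET ioctl
 * handled below:
 *
 *	vnconfig vnd0 /var/tmp/diskimage
 *	disklabel -rw vnd0 mydisktype	(or rely on the default label)
 *	newfs /dev/rvnd0a
 *	mount /dev/vnd0a /mnt
 *
 * "vnconfig -u vnd0" tears the unit down again via VNDIOCCLR.
 */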
99 
100 #include "fs_nfs.h"
101 
102 #include <sys/param.h>
103 #include <sys/systm.h>
104 #include <sys/namei.h>
105 #include <sys/proc.h>
106 #include <sys/errno.h>
107 #include <sys/buf.h>
108 #include <sys/malloc.h>
109 #include <sys/ioctl.h>
110 #include <sys/disklabel.h>
111 #include <sys/device.h>
112 #include <sys/disk.h>
113 #include <sys/stat.h>
114 #include <sys/mount.h>
115 #include <sys/vnode.h>
116 #include <sys/file.h>
117 #include <sys/uio.h>
118 #include <sys/conf.h>
119 
120 #include <miscfs/specfs/specdev.h>
121 
122 #include <dev/vndvar.h>
123 
124 #if defined(VNDDEBUG) && !defined(DEBUG)
125 #define	DEBUG
126 #endif
127 
128 #ifdef DEBUG
129 int dovndcluster = 1;
130 #define	VDB_FOLLOW	0x01
131 #define	VDB_INIT	0x02
132 #define	VDB_IO		0x04
133 #define	VDB_LABEL	0x08
134 int vnddebug = 0x00;
135 #endif
136 
137 #define	vndunit(x)	DISKUNIT(x)
138 
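/*
 * Each request handed to vndstrategy() is wrapped in a vndxfer and then
 * split into one or more vndbufs, one per contiguous run of file system
 * blocks returned by VOP_BMAP.  vndiodone() recovers the vndbuf from the
 * completed buf by a simple cast, which relies on vb_buf being the first
 * member of struct vndbuf.
 */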
139 struct vndxfer {
140 	struct buf	*vx_bp;		/* Pointer to parent buffer */
141 	int		vx_error;
142 	int		vx_pending;	/* # of pending aux buffers */
143 	int		vx_flags;
144 #define VX_BUSY		1
145 };
146 
147 struct vndbuf {
148 	struct buf	vb_buf;
149 	struct vndxfer	*vb_xfer;
150 };
151 
152 #define	VND_GETXFER(vnd)	pool_get(&(vnd)->sc_vxpool, PR_NOWAIT)
153 #define	VND_PUTXFER(vnd, vx)	pool_put(&(vnd)->sc_vxpool, (vx))
154 
155 #define	VND_GETBUF(vnd)		pool_get(&(vnd)->sc_vbpool, PR_NOWAIT)
156 #define	VND_PUTBUF(vnd, vb)	pool_put(&(vnd)->sc_vbpool, (vb))
157 
158 struct vnd_softc *vnd_softc;
159 int numvnd = 0;
160 
161 #define	VNDLABELDEV(dev) \
162 	(MAKEDISKDEV(major((dev)), vndunit((dev)), RAW_PART))
163 
164 /* called by main() at boot time */
165 void	vndattach __P((int));
166 
167 void	vndclear __P((struct vnd_softc *));
168 void	vndstart __P((struct vnd_softc *));
169 int	vndsetcred __P((struct vnd_softc *, struct ucred *));
170 void	vndthrottle __P((struct vnd_softc *, struct vnode *));
171 void	vndiodone __P((struct buf *));
172 void	vndshutdown __P((void));
173 
174 void	vndgetdefaultlabel __P((struct vnd_softc *, struct disklabel *));
175 void	vndgetdisklabel __P((dev_t));
176 
177 static	int vndlock __P((struct vnd_softc *));
178 static	void vndunlock __P((struct vnd_softc *));
179 
180 void
181 vndattach(num)
182 	int num;
183 {
184 	int i;
185 	char *mem;
186 
187 	if (num <= 0)
188 		return;
189 	i = num * sizeof(struct vnd_softc);
190 	mem = malloc(i, M_DEVBUF, M_NOWAIT);
191 	if (mem == NULL) {
192 		printf("WARNING: no memory for vnode disks\n");
193 		return;
194 	}
195 	memset(mem, 0, i);
196 	vnd_softc = (struct vnd_softc *)mem;
197 	numvnd = num;
198 
199 	for (i = 0; i < numvnd; i++)
200 		BUFQ_INIT(&vnd_softc[i].sc_tab);
201 }
202 
203 int
204 vndopen(dev, flags, mode, p)
205 	dev_t dev;
206 	int flags, mode;
207 	struct proc *p;
208 {
209 	int unit = vndunit(dev);
210 	struct vnd_softc *sc;
211 	int error = 0, part, pmask;
212 	struct disklabel *lp;
213 
214 #ifdef DEBUG
215 	if (vnddebug & VDB_FOLLOW)
216 		printf("vndopen(0x%x, 0x%x, 0x%x, %p)\n", dev, flags, mode, p);
217 #endif
218 	if (unit >= numvnd)
219 		return (ENXIO);
220 	sc = &vnd_softc[unit];
221 
222 	if ((error = vndlock(sc)) != 0)
223 		return (error);
224 
225 	lp = sc->sc_dkdev.dk_label;
226 
227 	part = DISKPART(dev);
228 	pmask = (1 << part);
229 
230 	/*
231 	 * If we're initialized, check to see if there are any other
232 	 * open partitions.  If not, then it's safe to update the
233 	 * in-core disklabel.
234 	 */
235 	if ((sc->sc_flags & VNF_INITED) && (sc->sc_dkdev.dk_openmask == 0))
236 		vndgetdisklabel(dev);
237 
238 	/* Check that the partition exists. */
239 	if (part != RAW_PART) {
240 		if (((sc->sc_flags & VNF_INITED) == 0) ||
241 		    ((part >= lp->d_npartitions) ||
242 		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
243 			error = ENXIO;
244 			goto done;
245 		}
246 	}
247 
248 	/* Prevent our unit from being unconfigured while open. */
249 	switch (mode) {
250 	case S_IFCHR:
251 		sc->sc_dkdev.dk_copenmask |= pmask;
252 		break;
253 
254 	case S_IFBLK:
255 		sc->sc_dkdev.dk_bopenmask |= pmask;
256 		break;
257 	}
258 	sc->sc_dkdev.dk_openmask =
259 	    sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask;
260 
261  done:
262 	vndunlock(sc);
263 	return (error);
264 }
265 
266 int
267 vndclose(dev, flags, mode, p)
268 	dev_t dev;
269 	int flags, mode;
270 	struct proc *p;
271 {
272 	int unit = vndunit(dev);
273 	struct vnd_softc *sc;
274 	int error = 0, part;
275 
276 #ifdef DEBUG
277 	if (vnddebug & VDB_FOLLOW)
278 		printf("vndclose(0x%x, 0x%x, 0x%x, %p)\n", dev, flags, mode, p);
279 #endif
280 
281 	if (unit >= numvnd)
282 		return (ENXIO);
283 	sc = &vnd_softc[unit];
284 
285 	if ((error = vndlock(sc)) != 0)
286 		return (error);
287 
288 	part = DISKPART(dev);
289 
290 	/* ...that much closer to allowing unconfiguration... */
291 	switch (mode) {
292 	case S_IFCHR:
293 		sc->sc_dkdev.dk_copenmask &= ~(1 << part);
294 		break;
295 
296 	case S_IFBLK:
297 		sc->sc_dkdev.dk_bopenmask &= ~(1 << part);
298 		break;
299 	}
300 	sc->sc_dkdev.dk_openmask =
301 	    sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask;
302 
303 	vndunlock(sc);
304 	return (0);
305 }
306 
307 /*
308  * Break the request into bsize pieces and submit using VOP_BMAP/VOP_STRATEGY.
309  */
310 void
311 vndstrategy(bp)
312 	struct buf *bp;
313 {
314 	int unit = vndunit(bp->b_dev);
315 	struct vnd_softc *vnd = &vnd_softc[unit];
316 	struct vndxfer *vnx;
317 	int s, bsize, resid;
318 	off_t bn;
319 	caddr_t addr;
320 	int sz, flags, error, wlabel;
321 	struct disklabel *lp;
322 	struct partition *pp;
323 
324 #ifdef DEBUG
325 	if (vnddebug & VDB_FOLLOW)
326 		printf("vndstrategy(%p): unit %d\n", bp, unit);
327 #endif
328 	if ((vnd->sc_flags & VNF_INITED) == 0) {
329 		bp->b_error = ENXIO;
330 		bp->b_flags |= B_ERROR;
331 		goto done;
332 	}
333 
334 	/* If it's a nil transfer, wake up the top half now. */
335 	if (bp->b_bcount == 0)
336 		goto done;
337 
338 	lp = vnd->sc_dkdev.dk_label;
339 
340 	/*
341 	 * The transfer must be a whole number of blocks.
342 	 */
343 	if ((bp->b_bcount % lp->d_secsize) != 0) {
344 		bp->b_error = EINVAL;
345 		bp->b_flags |= B_ERROR;
346 		goto done;
347 	}
348 
349 	/*
350 	 * Do bounds checking and adjust transfer.  If there's an error,
351 	 * the bounds check will flag that for us.
352 	 */
353 	wlabel = vnd->sc_flags & (VNF_WLABEL|VNF_LABELLING);
354 	if (DISKPART(bp->b_dev) != RAW_PART)
355 		if (bounds_check_with_label(bp, lp, wlabel) <= 0)
356 			goto done;
357 
358 	bp->b_resid = bp->b_bcount;
359 
360 	/*
361 	 * Put the block number in terms of the logical blocksize
362 	 * of the "device".
363 	 */
364 	bn = bp->b_blkno / (lp->d_secsize / DEV_BSIZE);
365 
366 	/*
367 	 * Translate the partition-relative block number to an absolute one.
368 	 */
369 	if (DISKPART(bp->b_dev) != RAW_PART) {
370 		pp = &vnd->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
371 		bn += pp->p_offset;
372 	}
373 
374 	/* ...and convert to a byte offset within the file. */
375 	bn *= lp->d_secsize;
376 
377 	if (vnd->sc_vp->v_mount == NULL) {
378 		bp->b_error = ENXIO;
379 		bp->b_flags |= B_ERROR;
380 		goto done;
381 	}
382  	bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize;
383 	addr = bp->b_data;
384 	flags = (bp->b_flags & (B_READ|B_ASYNC)) | B_CALL;
385 
386 	/* Allocate a header for this transfer and link it to the buffer */
387 	s = splbio();
388 	vnx = VND_GETXFER(vnd);
389 	splx(s);
390 	vnx->vx_flags = VX_BUSY;
391 	vnx->vx_error = 0;
392 	vnx->vx_pending = 0;
393 	vnx->vx_bp = bp;
394 
395 	for (resid = bp->b_resid; resid; resid -= sz) {
396 		struct vndbuf *nbp;
397 		struct vnode *vp;
398 		daddr_t nbn;
399 		int off, nra;
400 
401 		nra = 0;
402 		vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE);
403 		error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra);
404 		VOP_UNLOCK(vnd->sc_vp, 0);
405 
406 		if (error == 0 && (long)nbn == -1)
407 			error = EIO;
408 
409 		/*
410 		 * If there was an error or a hole in the file...punt.
411 		 * Note that we may have to wait for any operations
412 		 * that we have already fired off before releasing
413 		 * the buffer.
414 		 *
415 		 * XXX we could deal with holes here but it would be
416 		 * a hassle (in the write case).
417 		 */
418 		if (error) {
419 			s = splbio();
420 			vnx->vx_error = error;
421 			goto out;
422 		}
423 
424 #ifdef DEBUG
425 		if (!dovndcluster)
426 			nra = 0;
427 #endif
428 
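		/*
		 * Size this piece: if the transfer does not start on a
		 * file system block boundary, go only to the end of that
		 * block; otherwise take the block plus whatever read-ahead
		 * VOP_BMAP reported as contiguous (nra).  Never exceed the
		 * remaining resid.
		 */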
429 		if ((off = bn % bsize) != 0)
430 			sz = bsize - off;
431 		else
432 			sz = (1 + nra) * bsize;
433 		if (resid < sz)
434 			sz = resid;
435 #ifdef DEBUG
436 		if (vnddebug & VDB_IO)
437 			printf("vndstrategy: vp %p/%p bn 0x%qx/0x%x sz 0x%x\n",
438 			    vnd->sc_vp, vp, (long long)bn, nbn, sz);
439 #endif
440 
441 		s = splbio();
442 		nbp = VND_GETBUF(vnd);
443 		splx(s);
444 		nbp->vb_buf.b_flags = flags;
445 		nbp->vb_buf.b_bcount = sz;
446 		nbp->vb_buf.b_bufsize = bp->b_bufsize;
447 		nbp->vb_buf.b_error = 0;
448 		nbp->vb_buf.b_data = addr;
449 		nbp->vb_buf.b_blkno = nbp->vb_buf.b_rawblkno = nbn + btodb(off);
450 		nbp->vb_buf.b_proc = bp->b_proc;
451 		nbp->vb_buf.b_iodone = vndiodone;
452 		nbp->vb_buf.b_vp = NULLVP;
453 		LIST_INIT(&nbp->vb_buf.b_dep);
454 
455 		nbp->vb_xfer = vnx;
456 
457 		/*
458 		 * Just sort by block number
459 		 */
460 		s = splbio();
461 		if (vnx->vx_error != 0) {
462 			VND_PUTBUF(vnd, nbp);
463 			goto out;
464 		}
465 		vnx->vx_pending++;
466 		bgetvp(vp, &nbp->vb_buf);
467 		disksort_blkno(&vnd->sc_tab, &nbp->vb_buf);
468 		vndstart(vnd);
469 		splx(s);
470 		bn += sz;
471 		addr += sz;
472 	}
473 
474 	s = splbio();
475 
476 out: /* Arrive here at splbio */
477 	vnx->vx_flags &= ~VX_BUSY;
478 	if (vnx->vx_pending == 0) {
479 		if (vnx->vx_error != 0) {
480 			bp->b_error = vnx->vx_error;
481 			bp->b_flags |= B_ERROR;
482 		}
483 		VND_PUTXFER(vnd, vnx);
484 		biodone(bp);
485 	}
486 	splx(s);
487 	return;
488 
489  done:
490 	biodone(bp);
491 }
492 
493 /*
494  * Feed requests sequentially.
495  * We do it this way to keep from flooding NFS servers if we are connected
496  * to an NFS file.  This places the burden on the client rather than the
497  * server.
498  */
499 void
500 vndstart(vnd)
501 	struct vnd_softc *vnd;
502 {
503 	struct buf	*bp;
504 
505 	/*
506 	 * Dequeue now, since the lower-level strategy routine might
507 	 * queue using the same links.
508 	 */
509 
510 	if ((vnd->sc_flags & VNF_BUSY) != 0)
511 		return;
512 
513 	vnd->sc_flags |= VNF_BUSY;
514 
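	/*
	 * Dispatch queued buffers until the per-unit limit established by
	 * vndthrottle() is reached or the queue is empty.
	 */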
515 	while (vnd->sc_active < vnd->sc_maxactive) {
516 		bp = BUFQ_FIRST(&vnd->sc_tab);
517 		if (bp == NULL)
518 			break;
519 		BUFQ_REMOVE(&vnd->sc_tab, bp);
520 		vnd->sc_active++;
521 #ifdef DEBUG
522 		if (vnddebug & VDB_IO)
523 			printf("vndstart(%ld): bp %p vp %p blkno 0x%x"
524 				" flags %lx addr %p cnt 0x%lx\n",
525 			    (long) (vnd-vnd_softc), bp, bp->b_vp, bp->b_blkno,
526 			    bp->b_flags, bp->b_data, bp->b_bcount);
527 #endif
528 
529 		/* Instrumentation. */
530 		disk_busy(&vnd->sc_dkdev);
531 
532 		if ((bp->b_flags & B_READ) == 0)
533 			bp->b_vp->v_numoutput++;
534 		VOP_STRATEGY(bp);
535 	}
536 	vnd->sc_flags &= ~VNF_BUSY;
537 }
538 
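/*
 * Completion handler for a child (vndbuf) buffer.  Fold the result into
 * the parent transfer, release the child, and complete the original
 * request once every outstanding child has returned.
 */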
539 void
540 vndiodone(bp)
541 	struct buf *bp;
542 {
543 	struct vndbuf *vbp = (struct vndbuf *) bp;
544 	struct vndxfer *vnx = (struct vndxfer *)vbp->vb_xfer;
545 	struct buf *pbp = vnx->vx_bp;
546 	struct vnd_softc *vnd = &vnd_softc[vndunit(pbp->b_dev)];
547 	int s, resid;
548 
549 	s = splbio();
550 #ifdef DEBUG
551 	if (vnddebug & VDB_IO)
552 		printf("vndiodone(%ld): vbp %p vp %p blkno 0x%x addr %p cnt 0x%lx\n",
553 		    (long) (vnd-vnd_softc), vbp, vbp->vb_buf.b_vp,
554 		    vbp->vb_buf.b_blkno, vbp->vb_buf.b_data,
555 		    vbp->vb_buf.b_bcount);
556 #endif
557 
558 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
559 	pbp->b_resid -= resid;
560 	disk_unbusy(&vnd->sc_dkdev, resid);
561 	vnx->vx_pending--;
562 
563 	if (vbp->vb_buf.b_error) {
564 #ifdef DEBUG
565 		if (vnddebug & VDB_IO)
566 			printf("vndiodone: vbp %p error %d\n", vbp,
567 			    vbp->vb_buf.b_error);
568 #endif
569 		vnx->vx_error = vbp->vb_buf.b_error;
570 	}
571 
572 	if (vbp->vb_buf.b_vp != NULLVP)
573 		brelvp(&vbp->vb_buf);
574 
575 	VND_PUTBUF(vnd, vbp);
576 
577 	/*
578 	 * Wrap up this transaction if it has run to completion or, in
579 	 * case of an error, when all auxiliary buffers have returned.
580 	 */
581 	if (vnx->vx_error != 0) {
582 		pbp->b_flags |= B_ERROR;
583 		pbp->b_error = vnx->vx_error;
584 		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
585 
586 #ifdef DEBUG
587 			if (vnddebug & VDB_IO)
588 				printf("vndiodone: pbp %p iodone: error %d\n",
589 					pbp, vnx->vx_error);
590 #endif
591 			VND_PUTXFER(vnd, vnx);
592 			biodone(pbp);
593 		}
594 	} else if (pbp->b_resid == 0) {
595 
596 #ifdef DIAGNOSTIC
597 		if (vnx->vx_pending != 0)
598 			panic("vndiodone: vnx pending: %d", vnx->vx_pending);
599 #endif
600 
601 		if ((vnx->vx_flags & VX_BUSY) == 0) {
602 #ifdef DEBUG
603 			if (vnddebug & VDB_IO)
604 				printf("vndiodone: pbp %p iodone\n", pbp);
605 #endif
606 			VND_PUTXFER(vnd, vnx);
607 			biodone(pbp);
608 		}
609 	}
610 
611 	vnd->sc_active--;
612 	vndstart(vnd);
613 	splx(s);
614 }
615 
616 /* ARGSUSED */
617 int
618 vndread(dev, uio, flags)
619 	dev_t dev;
620 	struct uio *uio;
621 	int flags;
622 {
623 	int unit = vndunit(dev);
624 	struct vnd_softc *sc;
625 
626 #ifdef DEBUG
627 	if (vnddebug & VDB_FOLLOW)
628 		printf("vndread(0x%x, %p)\n", dev, uio);
629 #endif
630 
631 	if (unit >= numvnd)
632 		return (ENXIO);
633 	sc = &vnd_softc[unit];
634 
635 	if ((sc->sc_flags & VNF_INITED) == 0)
636 		return (ENXIO);
637 
638 	return (physio(vndstrategy, NULL, dev, B_READ, minphys, uio));
639 }
640 
641 /* ARGSUSED */
642 int
643 vndwrite(dev, uio, flags)
644 	dev_t dev;
645 	struct uio *uio;
646 	int flags;
647 {
648 	int unit = vndunit(dev);
649 	struct vnd_softc *sc;
650 
651 #ifdef DEBUG
652 	if (vnddebug & VDB_FOLLOW)
653 		printf("vndwrite(0x%x, %p)\n", dev, uio);
654 #endif
655 
656 	if (unit >= numvnd)
657 		return (ENXIO);
658 	sc = &vnd_softc[unit];
659 
660 	if ((sc->sc_flags & VNF_INITED) == 0)
661 		return (ENXIO);
662 
663 	return (physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio));
664 }
665 
666 /* ARGSUSED */
667 int
668 vndioctl(dev, cmd, data, flag, p)
669 	dev_t dev;
670 	u_long cmd;
671 	caddr_t data;
672 	int flag;
673 	struct proc *p;
674 {
675 	int unit = vndunit(dev);
676 	struct vnd_softc *vnd;
677 	struct vnd_ioctl *vio;
678 	struct vattr vattr;
679 	struct nameidata nd;
680 	int error, part, pmask;
681 	size_t geomsize;
682 #ifdef __HAVE_OLD_DISKLABEL
683 	struct disklabel newlabel;
684 #endif
685 
686 #ifdef DEBUG
687 	if (vnddebug & VDB_FOLLOW)
688 		printf("vndioctl(0x%x, 0x%lx, %p, 0x%x, %p): unit %d\n",
689 		    dev, cmd, data, flag, p, unit);
690 #endif
691 	error = suser(p->p_ucred, &p->p_acflag);
692 	if (error)
693 		return (error);
694 	if (unit >= numvnd)
695 		return (ENXIO);
696 
697 	vnd = &vnd_softc[unit];
698 	vio = (struct vnd_ioctl *)data;
699 
700 	/* Must be open for writes for these commands... */
701 	switch (cmd) {
702 	case VNDIOCSET:
703 	case VNDIOCCLR:
704 	case DIOCSDINFO:
705 	case DIOCWDINFO:
706 #ifdef __HAVE_OLD_DISKLABEL
707 	case ODIOCSDINFO:
708 	case ODIOCWDINFO:
709 #endif
710 	case DIOCWLABEL:
711 		if ((flag & FWRITE) == 0)
712 			return (EBADF);
713 	}
714 
715 	/* Must be initialized for these... */
716 	switch (cmd) {
717 	case VNDIOCCLR:
718 	case DIOCGDINFO:
719 	case DIOCSDINFO:
720 	case DIOCWDINFO:
721 	case DIOCGPART:
722 	case DIOCWLABEL:
723 	case DIOCGDEFLABEL:
724 #ifdef __HAVE_OLD_DISKLABEL
725 	case ODIOCGDINFO:
726 	case ODIOCSDINFO:
727 	case ODIOCWDINFO:
728 	case ODIOCGDEFLABEL:
729 #endif
730 		if ((vnd->sc_flags & VNF_INITED) == 0)
731 			return (ENXIO);
732 	}
733 
734 	switch (cmd) {
735 	case VNDIOCSET:
736 		if (vnd->sc_flags & VNF_INITED)
737 			return (EBUSY);
738 
739 		if ((error = vndlock(vnd)) != 0)
740 			return (error);
741 
742 		/*
743 		 * Always open for read and write.
744 		 * This is probably bogus, but it lets vn_open()
745 		 * weed out directories, sockets, etc. so we don't
746 		 * have to worry about them.
747 		 */
748 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vnd_file, p);
749 		if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
750 			vndunlock(vnd);
751 			return(error);
752 		}
753 		error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p);
754 		if (error) {
755 			VOP_UNLOCK(nd.ni_vp, 0);
756 			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
757 			vndunlock(vnd);
758 			return(error);
759 		}
760 		VOP_UNLOCK(nd.ni_vp, 0);
761 		vnd->sc_vp = nd.ni_vp;
762 		vnd->sc_size = btodb(vattr.va_size);	/* note truncation */
763 
764 		/*
765 		 * Use pseudo-geometry specified.  If none was provided,
766 		 * use "standard" Adaptec fictitious geometry.
767 		 */
768 		if (vio->vnd_flags & VNDIOF_HASGEOM) {
769 
770 			memcpy(&vnd->sc_geom, &vio->vnd_geom,
771 			    sizeof(vio->vnd_geom));
772 
773 			/*
774 			 * Sanity-check the sector size.
775 			 * XXX Don't allow secsize < DEV_BSIZE.  Should
776 			 * XXX we?
777 			 */
778 			if (vnd->sc_geom.vng_secsize < DEV_BSIZE ||
779 			    (vnd->sc_geom.vng_secsize % DEV_BSIZE) != 0) {
780 				(void) vn_close(nd.ni_vp, FREAD|FWRITE,
781 				    p->p_ucred, p);
782 				vndunlock(vnd);
783 				return (EINVAL);
784 			}
785 
786 			/*
787 			 * Compute the size (in DEV_BSIZE blocks) specified
788 			 * by the geometry.
789 			 */
790 			geomsize = (vnd->sc_geom.vng_nsectors *
791 			    vnd->sc_geom.vng_ntracks *
792 			    vnd->sc_geom.vng_ncylinders) *
793 			    (vnd->sc_geom.vng_secsize / DEV_BSIZE);
794 
795 			/*
796 			 * Sanity-check the size against the specified
797 			 * geometry.
798 			 */
799 			if (vnd->sc_size < geomsize) {
800 				(void) vn_close(nd.ni_vp, FREAD|FWRITE,
801 				    p->p_ucred, p);
802 				vndunlock(vnd);
803 				return (EINVAL);
804 			}
805 		} else {
806 			/*
807 			 * Size must be at least 2048 DEV_BSIZE blocks
808 			 * (1M) in order to use this geometry.
809 			 */
810 			if (vnd->sc_size < (32 * 64)) {
811 				vndunlock(vnd);
812 				return (EINVAL);
813 			}
814 
815 			vnd->sc_geom.vng_secsize = DEV_BSIZE;
816 			vnd->sc_geom.vng_nsectors = 32;
817 			vnd->sc_geom.vng_ntracks = 64;
818 			vnd->sc_geom.vng_ncylinders = vnd->sc_size / (64 * 32);
819 
820 			/*
821 			 * Compute the actual size allowed by this geometry.
822 			 */
823 			geomsize = 32 * 64 * vnd->sc_geom.vng_ncylinders;
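			/*
			 * For example (illustrative figures only): a 10 MB
			 * backing file is btodb()'d to 20480 DEV_BSIZE
			 * blocks, yielding 10 cylinders and a geomsize of
			 * 2048 * 10 = 20480 blocks.
			 */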
824 		}
825 
826 		/*
827 		 * Truncate the size to that specified by
828 		 * the geometry.
829 		 * XXX Should we even bother with this?
830 		 */
831 		vnd->sc_size = geomsize;
832 
833 		if ((error = vndsetcred(vnd, p->p_ucred)) != 0) {
834 			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
835 			vndunlock(vnd);
836 			return(error);
837 		}
838 		vndthrottle(vnd, vnd->sc_vp);
839 		vio->vnd_size = dbtob(vnd->sc_size);
840 		vnd->sc_flags |= VNF_INITED;
841 #ifdef DEBUG
842 		if (vnddebug & VDB_INIT)
843 			printf("vndioctl: SET vp %p size 0x%lx %d/%d/%d/%d\n",
844 			    vnd->sc_vp, (unsigned long) vnd->sc_size,
845 			    vnd->sc_geom.vng_secsize,
846 			    vnd->sc_geom.vng_nsectors,
847 			    vnd->sc_geom.vng_ntracks,
848 			    vnd->sc_geom.vng_ncylinders);
849 #endif
850 
851 		/* Attach the disk. */
852 		memset(vnd->sc_xname, 0, sizeof(vnd->sc_xname)); /* XXX */
853 		sprintf(vnd->sc_xname, "vnd%d", unit);		/* XXX */
854 		vnd->sc_dkdev.dk_name = vnd->sc_xname;
855 		disk_attach(&vnd->sc_dkdev);
856 
857 		/* Initialize the xfer and buffer pools. */
858 		pool_init(&vnd->sc_vxpool, sizeof(struct vndxfer), 0,
859 		    0, 0, "vndxpl", 0, NULL, NULL, M_DEVBUF);
860 		pool_init(&vnd->sc_vbpool, sizeof(struct vndbuf), 0,
861 		    0, 0, "vndbpl", 0, NULL, NULL, M_DEVBUF);
862 
863 		/* Try to read the disklabel. */
864 		vndgetdisklabel(dev);
865 
866 		vndunlock(vnd);
867 
868 		break;
869 
870 	case VNDIOCCLR:
871 		if ((error = vndlock(vnd)) != 0)
872 			return (error);
873 
874 		/*
875 		 * Don't unconfigure if any other partitions are open
876 		 * or if both the character and block flavors of this
877 		 * partition are open.
878 		 */
879 		part = DISKPART(dev);
880 		pmask = (1 << part);
881 		if ((vnd->sc_dkdev.dk_openmask & ~pmask) ||
882 		    ((vnd->sc_dkdev.dk_bopenmask & pmask) &&
883 		    (vnd->sc_dkdev.dk_copenmask & pmask))) {
884 			vndunlock(vnd);
885 			return (EBUSY);
886 		}
887 
888 		vndclear(vnd);
889 #ifdef DEBUG
890 		if (vnddebug & VDB_INIT)
891 			printf("vndioctl: CLRed\n");
892 #endif
893 
894 		/* Destroy the xfer and buffer pools. */
895 		pool_destroy(&vnd->sc_vxpool);
896 		pool_destroy(&vnd->sc_vbpool);
897 
898 		/* Detach the disk. */
899 		disk_detach(&vnd->sc_dkdev);
900 
901 		vndunlock(vnd);
902 
903 		break;
904 
905 	case DIOCGDINFO:
906 		*(struct disklabel *)data = *(vnd->sc_dkdev.dk_label);
907 		break;
908 
909 #ifdef __HAVE_OLD_DISKLABEL
910 	case ODIOCGDINFO:
911 		newlabel = *(vnd->sc_dkdev.dk_label);
912 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
913 			return ENOTTY;
914 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
915 		break;
916 #endif
917 
918 	case DIOCGPART:
919 		((struct partinfo *)data)->disklab = vnd->sc_dkdev.dk_label;
920 		((struct partinfo *)data)->part =
921 		    &vnd->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
922 		break;
923 
924 	case DIOCWDINFO:
925 	case DIOCSDINFO:
926 #ifdef __HAVE_OLD_DISKLABEL
927 	case ODIOCWDINFO:
928 	case ODIOCSDINFO:
929 #endif
930 	{
931 		struct disklabel *lp;
932 
933 		if ((error = vndlock(vnd)) != 0)
934 			return (error);
935 
936 		vnd->sc_flags |= VNF_LABELLING;
937 
938 #ifdef __HAVE_OLD_DISKLABEL
939 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
940 			memset(&newlabel, 0, sizeof newlabel);
941 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
942 			lp = &newlabel;
943 		} else
944 #endif
945 		lp = (struct disklabel *)data;
946 
947 		error = setdisklabel(vnd->sc_dkdev.dk_label,
948 		    lp, 0, vnd->sc_dkdev.dk_cpulabel);
949 		if (error == 0) {
950 			if (cmd == DIOCWDINFO
951 #ifdef __HAVE_OLD_DISKLABEL
952 			    || cmd == ODIOCWDINFO
953 #endif
954 			   )
955 				error = writedisklabel(VNDLABELDEV(dev),
956 				    vndstrategy, vnd->sc_dkdev.dk_label,
957 				    vnd->sc_dkdev.dk_cpulabel);
958 		}
959 
960 		vnd->sc_flags &= ~VNF_LABELLING;
961 
962 		vndunlock(vnd);
963 
964 		if (error)
965 			return (error);
966 		break;
967 	}
968 
969 	case DIOCWLABEL:
970 		if (*(int *)data != 0)
971 			vnd->sc_flags |= VNF_WLABEL;
972 		else
973 			vnd->sc_flags &= ~VNF_WLABEL;
974 		break;
975 
976 	case DIOCGDEFLABEL:
977 		vndgetdefaultlabel(vnd, (struct disklabel *)data);
978 		break;
979 
980 #ifdef __HAVE_OLD_DISKLABEL
981 	case ODIOCGDEFLABEL:
982 		vndgetdefaultlabel(vnd, &newlabel);
983 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
984 			return ENOTTY;
985 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
986 		break;
987 #endif
988 
989 	default:
990 		return (ENOTTY);
991 	}
992 
993 	return (0);
994 }
995 
996 /*
997  * Duplicate the current process's credentials.  Since we are called only
998  * as the result of a SET ioctl and only root can do that, any future access
999  * to this "disk" is essentially as root.  Note that credentials may change
1000  * if some other uid can write directly to the mapped file (NFS).
1001  */
1002 int
1003 vndsetcred(vnd, cred)
1004 	struct vnd_softc *vnd;
1005 	struct ucred *cred;
1006 {
1007 	struct uio auio;
1008 	struct iovec aiov;
1009 	char *tmpbuf;
1010 	int error;
1011 
1012 	vnd->sc_cred = crdup(cred);
1013 	tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
1014 
1015 	/* XXX: Horrible kludge to establish credentials for NFS */
1016 	aiov.iov_base = tmpbuf;
1017 	aiov.iov_len = min(DEV_BSIZE, dbtob(vnd->sc_size));
1018 	auio.uio_iov = &aiov;
1019 	auio.uio_iovcnt = 1;
1020 	auio.uio_offset = 0;
1021 	auio.uio_rw = UIO_READ;
1022 	auio.uio_segflg = UIO_SYSSPACE;
1023 	auio.uio_resid = aiov.iov_len;
1024 	vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY);
1025 	error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred);
1026 	if (error == 0) {
1027 		/*
1028 		 * Because vnd does all I/O directly through the vnode,
1029 		 * we need to flush (at least) the buffer read above out
1030 		 * of the buffer cache to prevent cache incoherencies.
1031 		 * Also, be careful to write dirty buffers back to stable
1032 		 * storage.
1033 		 */
1034 		error = vinvalbuf(vnd->sc_vp, V_SAVE, vnd->sc_cred,
1035 			    curproc, 0, 0);
1036 	}
1037 	VOP_UNLOCK(vnd->sc_vp, 0);
1038 
1039 	free(tmpbuf, M_TEMP);
1040 	return (error);
1041 }
1042 
1043 /*
1044  * Set maxactive based on FS type
1045  */
1046 void
1047 vndthrottle(vnd, vp)
1048 	struct vnd_softc *vnd;
1049 	struct vnode *vp;
1050 {
1051 #ifdef NFS
1052 	extern int (**nfsv2_vnodeop_p) __P((void *));
1053 
1054 	if (vp->v_op == nfsv2_vnodeop_p)
1055 		vnd->sc_maxactive = 2;
1056 	else
1057 #endif
1058 		vnd->sc_maxactive = 8;
1059 
1060 	if (vnd->sc_maxactive < 1)
1061 		vnd->sc_maxactive = 1;
1062 }
1063 
1064 void
1065 vndshutdown()
1066 {
1067 	struct vnd_softc *vnd;
1068 
1069 	for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++)
1070 		if (vnd->sc_flags & VNF_INITED)
1071 			vndclear(vnd);
1072 }
1073 
1074 void
1075 vndclear(vnd)
1076 	struct vnd_softc *vnd;
1077 {
1078 	struct vnode *vp = vnd->sc_vp;
1079 	struct proc *p = curproc;		/* XXX */
1080 
1081 #ifdef DEBUG
1082 	if (vnddebug & VDB_FOLLOW)
1083 		printf("vndclear(%p): vp %p\n", vnd, vp);
1084 #endif
1085 	vnd->sc_flags &= ~VNF_INITED;
1086 	if (vp == (struct vnode *)0)
1087 		panic("vndioctl: null vp");
1088 	(void) vn_close(vp, FREAD|FWRITE, vnd->sc_cred, p);
1089 	crfree(vnd->sc_cred);
1090 	vnd->sc_vp = (struct vnode *)0;
1091 	vnd->sc_cred = (struct ucred *)0;
1092 	vnd->sc_size = 0;
1093 }
1094 
1095 int
1096 vndsize(dev)
1097 	dev_t dev;
1098 {
1099 	struct vnd_softc *sc;
1100 	struct disklabel *lp;
1101 	int part, unit, omask;
1102 	int size;
1103 
1104 	unit = vndunit(dev);
1105 	if (unit >= numvnd)
1106 		return (-1);
1107 	sc = &vnd_softc[unit];
1108 
1109 	if ((sc->sc_flags & VNF_INITED) == 0)
1110 		return (-1);
1111 
1112 	part = DISKPART(dev);
1113 	omask = sc->sc_dkdev.dk_openmask & (1 << part);
1114 	lp = sc->sc_dkdev.dk_label;
1115 
1116 	if (omask == 0 && vndopen(dev, 0, S_IFBLK, curproc))
1117 		return (-1);
1118 
1119 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
1120 		size = -1;
1121 	else
1122 		size = lp->d_partitions[part].p_size *
1123 		    (lp->d_secsize / DEV_BSIZE);
1124 
1125 	if (omask == 0 && vndclose(dev, 0, S_IFBLK, curproc))
1126 		return (-1);
1127 
1128 	return (size);
1129 }
1130 
1131 int
1132 vnddump(dev, blkno, va, size)
1133 	dev_t dev;
1134 	daddr_t blkno;
1135 	caddr_t va;
1136 	size_t size;
1137 {
1138 
1139 	/* Not implemented. */
1140 	return ENXIO;
1141 }
1142 
1143 void
1144 vndgetdefaultlabel(sc, lp)
1145 	struct vnd_softc *sc;
1146 	struct disklabel *lp;
1147 {
1148 	struct vndgeom *vng = &sc->sc_geom;
1149 	struct partition *pp;
1150 
1151 	memset(lp, 0, sizeof(*lp));
1152 
1153 	lp->d_secperunit = sc->sc_size / (vng->vng_secsize / DEV_BSIZE);
1154 	lp->d_secsize = vng->vng_secsize;
1155 	lp->d_nsectors = vng->vng_nsectors;
1156 	lp->d_ntracks = vng->vng_ntracks;
1157 	lp->d_ncylinders = vng->vng_ncylinders;
1158 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1159 
1160 	strncpy(lp->d_typename, "vnd", sizeof(lp->d_typename));
1161 	lp->d_type = DTYPE_VND;
1162 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1163 	lp->d_rpm = 3600;
1164 	lp->d_interleave = 1;
1165 	lp->d_flags = 0;
1166 
1167 	pp = &lp->d_partitions[RAW_PART];
1168 	pp->p_offset = 0;
1169 	pp->p_size = lp->d_secperunit;
1170 	pp->p_fstype = FS_UNUSED;
1171 	lp->d_npartitions = RAW_PART + 1;
1172 
1173 	lp->d_magic = DISKMAGIC;
1174 	lp->d_magic2 = DISKMAGIC;
1175 	lp->d_checksum = dkcksum(lp);
1176 }
1177 
1178 /*
1179  * Read the disklabel from a vnd.  If one is not present, create a fake one.
1180  */
1181 void
1182 vndgetdisklabel(dev)
1183 	dev_t dev;
1184 {
1185 	struct vnd_softc *sc = &vnd_softc[vndunit(dev)];
1186 	char *errstring;
1187 	struct disklabel *lp = sc->sc_dkdev.dk_label;
1188 	struct cpu_disklabel *clp = sc->sc_dkdev.dk_cpulabel;
1189 	int i;
1190 
1191 	memset(clp, 0, sizeof(*clp));
1192 
1193 	vndgetdefaultlabel(sc, lp);
1194 
1195 	/*
1196 	 * Call the generic disklabel extraction routine.
1197 	 */
1198 	errstring = readdisklabel(VNDLABELDEV(dev), vndstrategy, lp, clp);
1199 	if (errstring) {
1200 		/*
1201 		 * Lack of a disklabel is common, but we print the message
1202 		 * anyway, since it might contain other useful information.
1203 		 */
1204 		printf("%s: %s\n", sc->sc_xname, errstring);
1205 
1206 		/*
1207 		 * For historical reasons, if there's no disklabel
1208 		 * present, all partitions must be FS_BSDFFS and
1209 		 * occupy the entire disk.
1210 		 */
1211 		for (i = 0; i < MAXPARTITIONS; i++) {
1212 			/*
1213 			 * Don't wipe out port-specific hacks (such as
1214 			 * the DOS partition hack of the i386 port).
1215 			 */
1216 			if (lp->d_partitions[i].p_fstype != FS_UNUSED)
1217 				continue;
1218 
1219 			lp->d_partitions[i].p_size = lp->d_secperunit;
1220 			lp->d_partitions[i].p_offset = 0;
1221 			lp->d_partitions[i].p_fstype = FS_BSDFFS;
1222 		}
1223 
1224 		strncpy(lp->d_packname, "default label",
1225 		    sizeof(lp->d_packname));
1226 
1227 		lp->d_checksum = dkcksum(lp);
1228 	}
1229 }
1230 
1231 /*
1232  * Wait interruptibly for an exclusive lock.
1233  *
1234  * XXX
1235  * Several drivers do this; it should be abstracted and made MP-safe.
1236  */
1237 static int
1238 vndlock(sc)
1239 	struct vnd_softc *sc;
1240 {
1241 	int error;
1242 
1243 	while ((sc->sc_flags & VNF_LOCKED) != 0) {
1244 		sc->sc_flags |= VNF_WANTED;
1245 		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
1246 			return (error);
1247 	}
1248 	sc->sc_flags |= VNF_LOCKED;
1249 	return (0);
1250 }
1251 
1252 /*
1253  * Unlock and wake up any waiters.
1254  */
1255 static void
1256 vndunlock(sc)
1257 	struct vnd_softc *sc;
1258 {
1259 
1260 	sc->sc_flags &= ~VNF_LOCKED;
1261 	if ((sc->sc_flags & VNF_WANTED) != 0) {
1262 		sc->sc_flags &= ~VNF_WANTED;
1263 		wakeup(sc);
1264 	}
1265 }
1266