xref: /netbsd-src/sys/dev/vnd.c (revision fdecd6a253f999ae92b139670d9e15cc9df4497c)
1 /*	$NetBSD: vnd.c,v 1.44 1997/06/26 16:28:37 kleink Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
30  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * Copyright (c) 1988 University of Utah.
41  * Copyright (c) 1990, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  *
44  * This code is derived from software contributed to Berkeley by
45  * the Systems Programming Group of the University of Utah Computer
46  * Science Department.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  * 3. All advertising materials mentioning features or use of this software
57  *    must display the following acknowledgement:
58  *	This product includes software developed by the University of
59  *	California, Berkeley and its contributors.
60  * 4. Neither the name of the University nor the names of its contributors
61  *    may be used to endorse or promote products derived from this software
62  *    without specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74  * SUCH DAMAGE.
75  *
76  * from: Utah $Hdr: vn.c 1.13 94/04/02$
77  *
78  *	@(#)vn.c	8.6 (Berkeley) 4/1/94
79  */
80 
81 /*
82  * Vnode disk driver.
83  *
84  * Block/character interface to a vnode.  Allows one to treat a file
85  * as a disk (e.g. build a filesystem in it, mount it, etc.).
86  *
87  * NOTE 1: This uses the VOP_BMAP/VOP_STRATEGY interface to the vnode
88  * instead of a simple VOP_RDWR.  We do this to avoid distorting the
89  * local buffer cache.
90  *
91  * NOTE 2: There is a security issue involved with this driver.
92  * Once mounted all access to the contents of the "mapped" file via
93  * the special file is controlled by the permissions on the special
94  * file, the protection of the mapped file is ignored (effectively,
95  * by using root credentials in all transactions).
96  *
97  * NOTE 3: Doesn't interact with leases, should it?
98  */
99 
100 #include <sys/param.h>
101 #include <sys/systm.h>
102 #include <sys/namei.h>
103 #include <sys/proc.h>
104 #include <sys/errno.h>
105 #include <sys/buf.h>
106 #include <sys/malloc.h>
107 #include <sys/ioctl.h>
108 #include <sys/disklabel.h>
109 #include <sys/device.h>
110 #include <sys/disk.h>
111 #include <sys/stat.h>
112 #include <sys/mount.h>
113 #include <sys/vnode.h>
114 #include <sys/file.h>
115 #include <sys/uio.h>
116 #include <sys/conf.h>
117 
118 #include <miscfs/specfs/specdev.h>
119 
120 #include <dev/vndvar.h>
121 
/* Building with VNDDEBUG turns on the DEBUG-conditional code in this file. */
#if defined(VNDDEBUG) && !defined(DEBUG)
#define	DEBUG
#endif

#ifdef DEBUG
/* When zero, vndstrategy() discards the read-ahead count VOP_BMAP returns. */
int dovndcluster = 1;
/* Bits tested against vnddebug to select which diagnostics get printed. */
#define	VDB_FOLLOW	0x01
#define	VDB_INIT	0x02
#define	VDB_IO		0x04
#define	VDB_LABEL	0x08
/* Mask of VDB_* bits currently enabled; all diagnostics are off by default. */
int vnddebug = 0x00;
#endif

/*
 * b_cylin is an alias for the b_resid field; vndstrategy() stores the
 * block number there before handing the buffer to disksort().
 */
#define b_cylin	b_resid

/* Unit number encoded in a device number. */
#define	vndunit(x)	DISKUNIT(x)
138 
/*
 * Bookkeeping for one request passed to vndstrategy(): ties the auxiliary
 * buffers issued against the backing vnode to the original (parent) buffer.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	int		vx_error;	/* error to report on the parent, 0 if none */
	int		vx_pending;	/* # of pending aux buffers */
};
144 
/*
 * Auxiliary buffer.  vb_buf must stay the first member: vndiodone() is
 * handed a struct buf * and casts it back to a struct vndbuf *.
 */
struct vndbuf {
	struct buf	vb_buf;		/* buffer submitted via VOP_STRATEGY */
	struct vndxfer	*vb_xfer;	/* transfer this buffer belongs to */
};
149 
/*
 * Allocation helpers for transfer headers and auxiliary buffers.  Both
 * kinds come from M_DEVBUF and are requested with M_WAITOK.
 */
#define	getvndxfer()	\
	((struct vndxfer *)malloc(sizeof(struct vndxfer), M_DEVBUF, M_WAITOK))
#define putvndxfer(vnx)	\
	free((caddr_t)(vnx), M_DEVBUF)
#define	getvndbuf()	\
	((struct vndbuf *)malloc(sizeof(struct vndbuf), M_DEVBUF, M_WAITOK))
#define putvndbuf(vbp)	\
	free((caddr_t)(vbp), M_DEVBUF)
158 
/* Per-unit state array set up by vndattach(); numvnd is its element count. */
struct vnd_softc *vnd_softc;
int numvnd = 0;

/* Device number of the raw partition on the same major/unit as `dev'. */
#define	VNDLABELDEV(dev) \
	(MAKEDISKDEV(major((dev)), vndunit((dev)), RAW_PART))
164 
/* called by main() at boot time */
void	vndattach __P((int));

/* Internal helpers; the definitions follow later in this file. */
void	vndclear __P((struct vnd_softc *));
void	vndstart __P((struct vnd_softc *));
int	vndsetcred __P((struct vnd_softc *, struct ucred *));
void	vndthrottle __P((struct vnd_softc *, struct vnode *));
void	vndiodone __P((struct buf *));
void	vndshutdown __P((void));

void	vndgetdisklabel __P((dev_t));

/* Per-unit exclusive lock, built on the VNF_LOCKED/VNF_WANTED flags. */
static	int vndlock __P((struct vnd_softc *));
static	void vndunlock __P((struct vnd_softc *));
179 
180 void
181 vndattach(num)
182 	int num;
183 {
184 	char *mem;
185 	register u_long size;
186 
187 	if (num <= 0)
188 		return;
189 	size = num * sizeof(struct vnd_softc);
190 	mem = malloc(size, M_DEVBUF, M_NOWAIT);
191 	if (mem == NULL) {
192 		printf("WARNING: no memory for vnode disks\n");
193 		return;
194 	}
195 	bzero(mem, size);
196 	vnd_softc = (struct vnd_softc *)mem;
197 	numvnd = num;
198 }
199 
200 int
201 vndopen(dev, flags, mode, p)
202 	dev_t dev;
203 	int flags, mode;
204 	struct proc *p;
205 {
206 	int unit = vndunit(dev);
207 	struct vnd_softc *sc;
208 	int error = 0, part, pmask;
209 	struct disklabel *lp;
210 
211 #ifdef DEBUG
212 	if (vnddebug & VDB_FOLLOW)
213 		printf("vndopen(%x, %x, %x, %p)\n", dev, flags, mode, p);
214 #endif
215 	if (unit >= numvnd)
216 		return (ENXIO);
217 	sc = &vnd_softc[unit];
218 
219 	if ((error = vndlock(sc)) != 0)
220 		return (error);
221 
222 	lp = sc->sc_dkdev.dk_label;
223 
224 	part = DISKPART(dev);
225 	pmask = (1 << part);
226 
227 	/*
228 	 * If we're initialized, check to see if there are any other
229 	 * open partitions.  If not, then it's safe to update the
230 	 * in-core disklabel.
231 	 */
232 	if ((sc->sc_flags & VNF_INITED) && (sc->sc_dkdev.dk_openmask == 0))
233 		vndgetdisklabel(dev);
234 
235 	/* Check that the partitions exists. */
236 	if (part != RAW_PART) {
237 		if (((sc->sc_flags & VNF_INITED) == 0) ||
238 		    ((part >= lp->d_npartitions) ||
239 		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
240 			error = ENXIO;
241 			goto done;
242 		}
243 	}
244 
245 	/* Prevent our unit from being unconfigured while open. */
246 	switch (mode) {
247 	case S_IFCHR:
248 		sc->sc_dkdev.dk_copenmask |= pmask;
249 		break;
250 
251 	case S_IFBLK:
252 		sc->sc_dkdev.dk_bopenmask |= pmask;
253 		break;
254 	}
255 	sc->sc_dkdev.dk_openmask =
256 	    sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask;
257 
258  done:
259 	vndunlock(sc);
260 	return (error);
261 }
262 
263 int
264 vndclose(dev, flags, mode, p)
265 	dev_t dev;
266 	int flags, mode;
267 	struct proc *p;
268 {
269 	int unit = vndunit(dev);
270 	struct vnd_softc *sc;
271 	int error = 0, part;
272 
273 #ifdef DEBUG
274 	if (vnddebug & VDB_FOLLOW)
275 		printf("vndclose(%x, %x, %x, %p)\n", dev, flags, mode, p);
276 #endif
277 
278 	if (unit >= numvnd)
279 		return (ENXIO);
280 	sc = &vnd_softc[unit];
281 
282 	if ((error = vndlock(sc)) != 0)
283 		return (error);
284 
285 	part = DISKPART(dev);
286 
287 	/* ...that much closer to allowing unconfiguration... */
288 	switch (mode) {
289 	case S_IFCHR:
290 		sc->sc_dkdev.dk_copenmask &= ~(1 << part);
291 		break;
292 
293 	case S_IFBLK:
294 		sc->sc_dkdev.dk_bopenmask &= ~(1 << part);
295 		break;
296 	}
297 	sc->sc_dkdev.dk_openmask =
298 	    sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask;
299 
300 	vndunlock(sc);
301 	return (0);
302 }
303 
304 /*
305  * Break the request into bsize pieces and submit using VOP_BMAP/VOP_STRATEGY.
306  */
307 void
308 vndstrategy(bp)
309 	register struct buf *bp;
310 {
311 	int unit = vndunit(bp->b_dev);
312 	struct vnd_softc *vnd = &vnd_softc[unit];
313 	struct vndbuf *nbp;
314 	struct vndxfer *vnx;
315 	int bn, bsize, resid;
316 	caddr_t addr;
317 	int sz, flags, error, wlabel;
318 	struct disklabel *lp;
319 	struct partition *pp;
320 
321 #ifdef DEBUG
322 	if (vnddebug & VDB_FOLLOW)
323 		printf("vndstrategy(%p): unit %d\n", bp, unit);
324 #endif
325 	if ((vnd->sc_flags & VNF_INITED) == 0) {
326 		bp->b_error = ENXIO;
327 		bp->b_flags |= B_ERROR;
328 		goto done;
329 	}
330 
331 	/* If it's a nil transfer, wake up the top half now. */
332 	if (bp->b_bcount == 0)
333 		goto done;
334 
335 	lp = vnd->sc_dkdev.dk_label;
336 
337 	/*
338 	 * The transfer must be a whole number of blocks.
339 	 */
340 	if ((bp->b_bcount % lp->d_secsize) != 0) {
341 		bp->b_error = EINVAL;
342 		bp->b_flags |= B_ERROR;
343 		goto done;
344 	}
345 
346 	/*
347 	 * Do bounds checking and adjust transfer.  If there's an error,
348 	 * the bounds check will flag that for us.
349 	 */
350 	wlabel = vnd->sc_flags & (VNF_WLABEL|VNF_LABELLING);
351 	if (DISKPART(bp->b_dev) != RAW_PART)
352 		if (bounds_check_with_label(bp, lp, wlabel) <= 0)
353 			goto done;
354 
355 	bp->b_resid = bp->b_bcount;
356 
357 	/*
358 	 * Put the block number in terms of the logical blocksize
359 	 * of the "device".
360 	 */
361 	bn = bp->b_blkno / (lp->d_secsize / DEV_BSIZE);
362 
363 	/*
364 	 * Translate the partition-relative block number to an absolute.
365 	 */
366 	if (DISKPART(bp->b_dev) != RAW_PART) {
367 		pp = &vnd->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
368 		bn += pp->p_offset;
369 	}
370 
371 	/* ...and convert to a byte offset within the file. */
372 	bn *= lp->d_secsize;
373 
374  	bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize;
375 	addr = bp->b_data;
376 	flags = bp->b_flags | B_CALL;
377 
378 	/* Allocate a header for this transfer and link it to the buffer */
379 	vnx = getvndxfer();
380 	vnx->vx_error = 0;
381 	vnx->vx_pending = 0;
382 	vnx->vx_bp = bp;
383 
384 	for (resid = bp->b_resid; resid; resid -= sz) {
385 		struct vnode *vp;
386 		daddr_t nbn;
387 		int off, s, nra;
388 
389 		nra = 0;
390 		VOP_LOCK(vnd->sc_vp);
391 		error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra);
392 		VOP_UNLOCK(vnd->sc_vp);
393 
394 		if (error == 0 && (long)nbn == -1)
395 			error = EIO;
396 
397 		/*
398 		 * If there was an error or a hole in the file...punt.
399 		 * Note that we may have to wait for any operations
400 		 * that we have already fired off before releasing
401 		 * the buffer.
402 		 *
403 		 * XXX we could deal with holes here but it would be
404 		 * a hassle (in the write case).
405 		 */
406 		if (error) {
407 			vnx->vx_error = error;
408 			s = splbio();
409 			if (vnx->vx_pending == 0) {
410 				bp->b_error = error;
411 				bp->b_flags |= B_ERROR;
412 				putvndxfer(vnx);
413 				goto done;
414 			}
415 			splx(s);
416 			return;
417 		}
418 
419 #ifdef DEBUG
420 		if (!dovndcluster)
421 			nra = 0;
422 #endif
423 
424 		if ((off = bn % bsize) != 0)
425 			sz = bsize - off;
426 		else
427 			sz = (1 + nra) * bsize;
428 		if (resid < sz)
429 			sz = resid;
430 #ifdef DEBUG
431 		if (vnddebug & VDB_IO)
432 			printf("vndstrategy: vp %p/%p bn %x/%x sz %x\n",
433 			    vnd->sc_vp, vp, bn, nbn, sz);
434 #endif
435 
436 		nbp = getvndbuf();
437 		nbp->vb_buf.b_flags = flags;
438 		nbp->vb_buf.b_bcount = sz;
439 		nbp->vb_buf.b_bufsize = bp->b_bufsize;
440 		nbp->vb_buf.b_error = 0;
441 		if (vp->v_type == VBLK || vp->v_type == VCHR)
442 			nbp->vb_buf.b_dev = vp->v_rdev;
443 		else
444 			nbp->vb_buf.b_dev = NODEV;
445 		nbp->vb_buf.b_data = addr;
446 		nbp->vb_buf.b_blkno = nbn + btodb(off);
447 		nbp->vb_buf.b_proc = bp->b_proc;
448 		nbp->vb_buf.b_iodone = vndiodone;
449 		nbp->vb_buf.b_vp = vp;
450 		nbp->vb_buf.b_rcred = vnd->sc_cred;	/* XXX crdup? */
451 		nbp->vb_buf.b_wcred = vnd->sc_cred;	/* XXX crdup? */
452 		if (bp->b_dirtyend == 0) {
453 			nbp->vb_buf.b_dirtyoff = 0;
454 			nbp->vb_buf.b_dirtyend = sz;
455 		} else {
456 			nbp->vb_buf.b_dirtyoff =
457 			    max(0, bp->b_dirtyoff - (bp->b_bcount - resid));
458 			nbp->vb_buf.b_dirtyend =
459 			    min(sz,
460 				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
461 		}
462 		if (bp->b_validend == 0) {
463 			nbp->vb_buf.b_validoff = 0;
464 			nbp->vb_buf.b_validend = sz;
465 		} else {
466 			nbp->vb_buf.b_validoff =
467 			    max(0, bp->b_validoff - (bp->b_bcount - resid));
468 			nbp->vb_buf.b_validend =
469 			    min(sz,
470 				max(0, bp->b_validend - (bp->b_bcount-resid)));
471 		}
472 
473 		nbp->vb_xfer = vnx;
474 
475 		/*
476 		 * Just sort by block number
477 		 */
478 		nbp->vb_buf.b_cylin = nbp->vb_buf.b_blkno;
479 		s = splbio();
480 		vnx->vx_pending++;
481 		disksort(&vnd->sc_tab, &nbp->vb_buf);
482 		if (vnd->sc_tab.b_active < vnd->sc_maxactive) {
483 			vnd->sc_tab.b_active++;
484 			vndstart(vnd);
485 		}
486 		splx(s);
487 		bn += sz;
488 		addr += sz;
489 	}
490 	return;
491 
492  done:
493 	biodone(bp);
494 }
495 
496 /*
497  * Feed requests sequentially.
498  * We do it this way to keep from flooding NFS servers if we are connected
499  * to an NFS file.  This places the burden on the client rather than the
500  * server.
501  */
502 void
503 vndstart(vnd)
504 	register struct vnd_softc *vnd;
505 {
506 	register struct buf *bp;
507 
508 	/*
509 	 * Dequeue now since lower level strategy routine might
510 	 * queue using same links
511 	 */
512 	bp = vnd->sc_tab.b_actf;
513 	vnd->sc_tab.b_actf = bp->b_actf;
514 #ifdef DEBUG
515 	if (vnddebug & VDB_IO)
516 		printf("vndstart(%ld): bp %p vp %p blkno %x addr %p cnt %lx\n",
517 		    (long) (vnd-vnd_softc), bp, bp->b_vp, bp->b_blkno,
518 		    bp->b_data, bp->b_bcount);
519 #endif
520 
521 	/* Instrumentation. */
522 	disk_busy(&vnd->sc_dkdev);
523 
524 	if ((bp->b_flags & B_READ) == 0)
525 		bp->b_vp->v_numoutput++;
526 	VOP_STRATEGY(bp);
527 }
528 
529 void
530 vndiodone(bp)
531 	struct buf *bp;
532 {
533 	register struct vndbuf *vbp = (struct vndbuf *) bp;
534 	register struct vndxfer *vnx = (struct vndxfer *)vbp->vb_xfer;
535 	register struct buf *pbp = vnx->vx_bp;
536 	register struct vnd_softc *vnd = &vnd_softc[vndunit(pbp->b_dev)];
537 	int s, resid;
538 
539 	s = splbio();
540 #ifdef DEBUG
541 	if (vnddebug & VDB_IO)
542 		printf("vndiodone(%ld): vbp %p vp %p blkno %x addr %p cnt %lx\n",
543 		    (long) (vnd-vnd_softc), vbp, vbp->vb_buf.b_vp,
544 		    vbp->vb_buf.b_blkno, vbp->vb_buf.b_data,
545 		    vbp->vb_buf.b_bcount);
546 #endif
547 
548 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
549 	pbp->b_resid -= resid;
550 	disk_unbusy(&vnd->sc_dkdev, resid);
551 	vnx->vx_pending--;
552 
553 	if (vbp->vb_buf.b_error) {
554 #ifdef DEBUG
555 		if (vnddebug & VDB_IO)
556 			printf("vndiodone: vbp %p error %d\n", vbp,
557 			    vbp->vb_buf.b_error);
558 #endif
559 		vnx->vx_error = vbp->vb_buf.b_error;
560 	}
561 	putvndbuf(vbp);
562 
563 	/*
564 	 * Wrap up this transaction if it has run to completion or, in
565 	 * case of an error, when all auxiliary buffers have returned.
566 	 */
567 	if (pbp->b_resid == 0 || (vnx->vx_error && vnx->vx_pending == 0)) {
568 
569 		if (vnx->vx_error != 0) {
570 			pbp->b_flags |= B_ERROR;
571 			pbp->b_error = vnx->vx_error;
572 		}
573 		putvndxfer(vnx);
574 #ifdef DEBUG
575 		if (vnddebug & VDB_IO)
576 			printf("vndiodone: pbp %p iodone\n", pbp);
577 #endif
578 		biodone(pbp);
579 	}
580 
581 	if (vnd->sc_tab.b_actf)
582 		vndstart(vnd);
583 	else
584 		vnd->sc_tab.b_active--;
585 	splx(s);
586 }
587 
588 /* ARGSUSED */
589 int
590 vndread(dev, uio, flags)
591 	dev_t dev;
592 	struct uio *uio;
593 	int flags;
594 {
595 	int unit = vndunit(dev);
596 	struct vnd_softc *sc;
597 
598 #ifdef DEBUG
599 	if (vnddebug & VDB_FOLLOW)
600 		printf("vndread(%x, %p)\n", dev, uio);
601 #endif
602 
603 	if (unit >= numvnd)
604 		return (ENXIO);
605 	sc = &vnd_softc[unit];
606 
607 	if ((sc->sc_flags & VNF_INITED) == 0)
608 		return (ENXIO);
609 
610 	return (physio(vndstrategy, NULL, dev, B_READ, minphys, uio));
611 }
612 
613 /* ARGSUSED */
614 int
615 vndwrite(dev, uio, flags)
616 	dev_t dev;
617 	struct uio *uio;
618 	int flags;
619 {
620 	int unit = vndunit(dev);
621 	struct vnd_softc *sc;
622 
623 #ifdef DEBUG
624 	if (vnddebug & VDB_FOLLOW)
625 		printf("vndwrite(%x, %p)\n", dev, uio);
626 #endif
627 
628 	if (unit >= numvnd)
629 		return (ENXIO);
630 	sc = &vnd_softc[unit];
631 
632 	if ((sc->sc_flags & VNF_INITED) == 0)
633 		return (ENXIO);
634 
635 	return (physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio));
636 }
637 
638 /* ARGSUSED */
639 int
640 vndioctl(dev, cmd, data, flag, p)
641 	dev_t dev;
642 	u_long cmd;
643 	caddr_t data;
644 	int flag;
645 	struct proc *p;
646 {
647 	int unit = vndunit(dev);
648 	register struct vnd_softc *vnd;
649 	struct vnd_ioctl *vio;
650 	struct vattr vattr;
651 	struct nameidata nd;
652 	int error, part, pmask;
653 	size_t geomsize;
654 
655 #ifdef DEBUG
656 	if (vnddebug & VDB_FOLLOW)
657 		printf("vndioctl(%x, %lx, %p, %x, %p): unit %d\n",
658 		    dev, cmd, data, flag, p, unit);
659 #endif
660 	error = suser(p->p_ucred, &p->p_acflag);
661 	if (error)
662 		return (error);
663 	if (unit >= numvnd)
664 		return (ENXIO);
665 
666 	vnd = &vnd_softc[unit];
667 	vio = (struct vnd_ioctl *)data;
668 
669 	/* Must be open for writes for these commands... */
670 	switch (cmd) {
671 	case VNDIOCSET:
672 	case VNDIOCCLR:
673 	case DIOCSDINFO:
674 	case DIOCWDINFO:
675 	case DIOCWLABEL:
676 		if ((flag & FWRITE) == 0)
677 			return (EBADF);
678 	}
679 
680 	/* Must be initialized for these... */
681 	switch (cmd) {
682 	case VNDIOCCLR:
683 	case DIOCGDINFO:
684 	case DIOCSDINFO:
685 	case DIOCWDINFO:
686 	case DIOCGPART:
687 	case DIOCWLABEL:
688 		if ((vnd->sc_flags & VNF_INITED) == 0)
689 			return (ENXIO);
690 	}
691 
692 	switch (cmd) {
693 	case VNDIOCSET:
694 		if (vnd->sc_flags & VNF_INITED)
695 			return (EBUSY);
696 
697 		if ((error = vndlock(vnd)) != 0)
698 			return (error);
699 
700 		/*
701 		 * Always open for read and write.
702 		 * This is probably bogus, but it lets vn_open()
703 		 * weed out directories, sockets, etc. so we don't
704 		 * have to worry about them.
705 		 */
706 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vnd_file, p);
707 		if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
708 			vndunlock(vnd);
709 			return(error);
710 		}
711 		error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p);
712 		if (error) {
713 			VOP_UNLOCK(nd.ni_vp);
714 			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
715 			vndunlock(vnd);
716 			return(error);
717 		}
718 		VOP_UNLOCK(nd.ni_vp);
719 		vnd->sc_vp = nd.ni_vp;
720 		vnd->sc_size = btodb(vattr.va_size);	/* note truncation */
721 
722 		/*
723 		 * Use pseudo-geometry specified.  If none was provided,
724 		 * use "standard" Adaptec fictitious geometry.
725 		 */
726 		if (vio->vnd_flags & VNDIOF_HASGEOM) {
727 
728 			bcopy(&vio->vnd_geom, &vnd->sc_geom,
729 			    sizeof(vio->vnd_geom));
730 
731 			/*
732 			 * Sanity-check the sector size.
733 			 * XXX Don't allow secsize < DEV_BSIZE.  Should
734 			 * XXX we?
735 			 */
736 			if (vnd->sc_geom.vng_secsize < DEV_BSIZE ||
737 			    (vnd->sc_geom.vng_secsize % DEV_BSIZE) != 0) {
738 				(void) vn_close(nd.ni_vp, FREAD|FWRITE,
739 				    p->p_ucred, p);
740 				vndunlock(vnd);
741 				return (EINVAL);
742 			}
743 
744 			/*
745 			 * Compute the size (in DEV_BSIZE blocks) specified
746 			 * by the geometry.
747 			 */
748 			geomsize = (vnd->sc_geom.vng_nsectors *
749 			    vnd->sc_geom.vng_ntracks *
750 			    vnd->sc_geom.vng_ncylinders) *
751 			    (vnd->sc_geom.vng_secsize / DEV_BSIZE);
752 
753 			/*
754 			 * Sanity-check the size against the specified
755 			 * geometry.
756 			 */
757 			if (vnd->sc_size < geomsize) {
758 				(void) vn_close(nd.ni_vp, FREAD|FWRITE,
759 				    p->p_ucred, p);
760 				vndunlock(vnd);
761 				return (EINVAL);
762 			}
763 		} else {
764 			/*
765 			 * Size must be at least 2048 DEV_BSIZE blocks
766 			 * (1M) in order to use this geometry.
767 			 */
768 			if (vnd->sc_size < (32 * 64))
769 				return (EINVAL);
770 
771 			vnd->sc_geom.vng_secsize = DEV_BSIZE;
772 			vnd->sc_geom.vng_nsectors = 32;
773 			vnd->sc_geom.vng_ntracks = 64;
774 			vnd->sc_geom.vng_ncylinders = vnd->sc_size / (64 * 32);
775 
776 			/*
777 			 * Compute the actual size allowed by this geometry.
778 			 */
779 			geomsize = 32 * 64 * vnd->sc_geom.vng_ncylinders;
780 		}
781 
782 		/*
783 		 * Truncate the size to that specified by
784 		 * the geometry.
785 		 * XXX Should we even bother with this?
786 		 */
787 		vnd->sc_size = geomsize;
788 
789 		if ((error = vndsetcred(vnd, p->p_ucred)) != 0) {
790 			(void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p);
791 			vndunlock(vnd);
792 			return(error);
793 		}
794 		vndthrottle(vnd, vnd->sc_vp);
795 		vio->vnd_size = dbtob(vnd->sc_size);
796 		vnd->sc_flags |= VNF_INITED;
797 #ifdef DEBUG
798 		if (vnddebug & VDB_INIT)
799 			printf("vndioctl: SET vp %p size %lx %d/%d/%d/%d\n",
800 			    vnd->sc_vp, (unsigned long) vnd->sc_size,
801 			    vnd->sc_geom.vng_secsize,
802 			    vnd->sc_geom.vng_nsectors,
803 			    vnd->sc_geom.vng_ntracks,
804 			    vnd->sc_geom.vng_ncylinders);
805 #endif
806 
807 		/* Attach the disk. */
808 		bzero(vnd->sc_xname, sizeof(vnd->sc_xname));	/* XXX */
809 		sprintf(vnd->sc_xname, "vnd%d", unit);		/* XXX */
810 		vnd->sc_dkdev.dk_name = vnd->sc_xname;
811 		disk_attach(&vnd->sc_dkdev);
812 
813 		/* Try and read the disklabel. */
814 		vndgetdisklabel(dev);
815 
816 		vndunlock(vnd);
817 
818 		break;
819 
820 	case VNDIOCCLR:
821 		if ((error = vndlock(vnd)) != 0)
822 			return (error);
823 
824 		/*
825 		 * Don't unconfigure if any other partitions are open
826 		 * or if both the character and block flavors of this
827 		 * partition are open.
828 		 */
829 		part = DISKPART(dev);
830 		pmask = (1 << part);
831 		if ((vnd->sc_dkdev.dk_openmask & ~pmask) ||
832 		    ((vnd->sc_dkdev.dk_bopenmask & pmask) &&
833 		    (vnd->sc_dkdev.dk_copenmask & pmask))) {
834 			vndunlock(vnd);
835 			return (EBUSY);
836 		}
837 
838 		vndclear(vnd);
839 #ifdef DEBUG
840 		if (vnddebug & VDB_INIT)
841 			printf("vndioctl: CLRed\n");
842 #endif
843 
844 		/* Detatch the disk. */
845 		disk_detach(&vnd->sc_dkdev);
846 
847 		vndunlock(vnd);
848 
849 		break;
850 
851 	case DIOCGDINFO:
852 		*(struct disklabel *)data = *(vnd->sc_dkdev.dk_label);
853 		break;
854 
855 	case DIOCGPART:
856 		((struct partinfo *)data)->disklab = vnd->sc_dkdev.dk_label;
857 		((struct partinfo *)data)->part =
858 		    &vnd->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
859 		break;
860 
861 	case DIOCWDINFO:
862 	case DIOCSDINFO:
863 		if ((error = vndlock(vnd)) != 0)
864 			return (error);
865 
866 		vnd->sc_flags |= VNF_LABELLING;
867 
868 		error = setdisklabel(vnd->sc_dkdev.dk_label,
869 		    (struct disklabel *)data, 0, vnd->sc_dkdev.dk_cpulabel);
870 		if (error == 0) {
871 			if (cmd == DIOCWDINFO)
872 				error = writedisklabel(VNDLABELDEV(dev),
873 				    vndstrategy, vnd->sc_dkdev.dk_label,
874 				    vnd->sc_dkdev.dk_cpulabel);
875 		}
876 
877 		vnd->sc_flags &= ~VNF_LABELLING;
878 
879 		vndunlock(vnd);
880 
881 		if (error)
882 			return (error);
883 		break;
884 
885 	case DIOCWLABEL:
886 		if (*(int *)data != 0)
887 			vnd->sc_flags |= VNF_WLABEL;
888 		else
889 			vnd->sc_flags &= ~VNF_WLABEL;
890 		break;
891 
892 	default:
893 		return (ENOTTY);
894 	}
895 
896 	return (0);
897 }
898 
899 /*
900  * Duplicate the current processes' credentials.  Since we are called only
901  * as the result of a SET ioctl and only root can do that, any future access
902  * to this "disk" is essentially as root.  Note that credentials may change
903  * if some other uid can write directly to the mapped file (NFS).
904  */
905 int
906 vndsetcred(vnd, cred)
907 	register struct vnd_softc *vnd;
908 	struct ucred *cred;
909 {
910 	struct uio auio;
911 	struct iovec aiov;
912 	char *tmpbuf;
913 	int error;
914 
915 	vnd->sc_cred = crdup(cred);
916 	tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
917 
918 	/* XXX: Horrible kludge to establish credentials for NFS */
919 	aiov.iov_base = tmpbuf;
920 	aiov.iov_len = min(DEV_BSIZE, dbtob(vnd->sc_size));
921 	auio.uio_iov = &aiov;
922 	auio.uio_iovcnt = 1;
923 	auio.uio_offset = 0;
924 	auio.uio_rw = UIO_READ;
925 	auio.uio_segflg = UIO_SYSSPACE;
926 	auio.uio_resid = aiov.iov_len;
927 	VOP_LOCK(vnd->sc_vp);
928 	error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred);
929 	VOP_UNLOCK(vnd->sc_vp);
930 
931 	free(tmpbuf, M_TEMP);
932 	return (error);
933 }
934 
935 /*
936  * Set maxactive based on FS type
937  */
938 void
939 vndthrottle(vnd, vp)
940 	register struct vnd_softc *vnd;
941 	struct vnode *vp;
942 {
943 #ifdef NFS
944 	extern int (**nfsv2_vnodeop_p) __P((void *));
945 
946 	if (vp->v_op == nfsv2_vnodeop_p)
947 		vnd->sc_maxactive = 2;
948 	else
949 #endif
950 		vnd->sc_maxactive = 8;
951 
952 	if (vnd->sc_maxactive < 1)
953 		vnd->sc_maxactive = 1;
954 }
955 
956 void
957 vndshutdown()
958 {
959 	register struct vnd_softc *vnd;
960 
961 	for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++)
962 		if (vnd->sc_flags & VNF_INITED)
963 			vndclear(vnd);
964 }
965 
966 void
967 vndclear(vnd)
968 	register struct vnd_softc *vnd;
969 {
970 	register struct vnode *vp = vnd->sc_vp;
971 	struct proc *p = curproc;		/* XXX */
972 
973 #ifdef DEBUG
974 	if (vnddebug & VDB_FOLLOW)
975 		printf("vndclear(%p): vp %p\n", vnd, vp);
976 #endif
977 	vnd->sc_flags &= ~VNF_INITED;
978 	if (vp == (struct vnode *)0)
979 		panic("vndioctl: null vp");
980 	(void) vn_close(vp, FREAD|FWRITE, vnd->sc_cred, p);
981 	crfree(vnd->sc_cred);
982 	vnd->sc_vp = (struct vnode *)0;
983 	vnd->sc_cred = (struct ucred *)0;
984 	vnd->sc_size = 0;
985 }
986 
987 int
988 vndsize(dev)
989 	dev_t dev;
990 {
991 	struct vnd_softc *sc;
992 	struct disklabel *lp;
993 	int part, unit, omask;
994 	int size;
995 
996 	unit = vndunit(dev);
997 	if (unit >= numvnd)
998 		return (-1);
999 	sc = &vnd_softc[unit];
1000 
1001 	if ((sc->sc_flags & VNF_INITED) == 0)
1002 		return (-1);
1003 
1004 	part = DISKPART(dev);
1005 	omask = sc->sc_dkdev.dk_openmask & (1 << part);
1006 	lp = sc->sc_dkdev.dk_label;
1007 
1008 	if (omask == 0 && vndopen(dev, 0, S_IFBLK, curproc))
1009 		return (-1);
1010 
1011 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
1012 		size = -1;
1013 	else
1014 		size = lp->d_partitions[part].p_size *
1015 		    (lp->d_secsize / DEV_BSIZE);
1016 
1017 	if (omask == 0 && vndclose(dev, 0, S_IFBLK, curproc))
1018 		return (-1);
1019 
1020 	return (size);
1021 }
1022 
1023 int
1024 vnddump(dev, blkno, va, size)
1025 	dev_t dev;
1026 	daddr_t blkno;
1027 	caddr_t va;
1028 	size_t size;
1029 {
1030 
1031 	/* Not implemented. */
1032 	return ENXIO;
1033 }
1034 
1035 /*
1036  * Read the disklabel from a vnd.  If one is not present, create a fake one.
1037  */
1038 void
1039 vndgetdisklabel(dev)
1040 	dev_t dev;
1041 {
1042 	struct vnd_softc *sc = &vnd_softc[vndunit(dev)];
1043 	char *errstring;
1044 	struct disklabel *lp = sc->sc_dkdev.dk_label;
1045 	struct cpu_disklabel *clp = sc->sc_dkdev.dk_cpulabel;
1046 	struct vndgeom *vng = &sc->sc_geom;
1047 	struct partition *pp;
1048 	int i;
1049 
1050 	bzero(lp, sizeof(*lp));
1051 	bzero(clp, sizeof(*clp));
1052 
1053 	lp->d_secperunit = sc->sc_size / (vng->vng_secsize / DEV_BSIZE);
1054 	lp->d_secsize = vng->vng_secsize;
1055 	lp->d_nsectors = vng->vng_nsectors;
1056 	lp->d_ntracks = vng->vng_ntracks;
1057 	lp->d_ncylinders = vng->vng_ncylinders;
1058 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1059 
1060 	strncpy(lp->d_typename, "vnd", sizeof(lp->d_typename));
1061 	lp->d_type = DTYPE_VND;
1062 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1063 	lp->d_rpm = 3600;
1064 	lp->d_interleave = 1;
1065 	lp->d_flags = 0;
1066 
1067 	pp = &lp->d_partitions[RAW_PART];
1068 	pp->p_offset = 0;
1069 	pp->p_size = lp->d_secperunit;
1070 	pp->p_fstype = FS_UNUSED;
1071 	lp->d_npartitions = RAW_PART + 1;
1072 
1073 	lp->d_magic = DISKMAGIC;
1074 	lp->d_magic2 = DISKMAGIC;
1075 	lp->d_checksum = dkcksum(lp);
1076 
1077 	/*
1078 	 * Call the generic disklabel extraction routine.
1079 	 */
1080 	errstring = readdisklabel(VNDLABELDEV(dev), vndstrategy, lp, clp);
1081 	if (errstring) {
1082 		/*
1083 		 * Lack of disklabel is common, but we print the warning
1084 		 * anyway, since it might contain other useful information.
1085 		 */
1086 		printf("%s: %s\n", sc->sc_xname, errstring);
1087 
1088 		/*
1089 		 * For historical reasons, if there's no disklabel
1090 		 * present, all partitions must be FS_BSDFFS and
1091 		 * occupy the entire disk.
1092 		 */
1093 		for (i = 0; i < MAXPARTITIONS; i++) {
1094 			lp->d_partitions[i].p_size = lp->d_secperunit;
1095 			lp->d_partitions[i].p_offset = 0;
1096 			lp->d_partitions[i].p_fstype = FS_BSDFFS;
1097 		}
1098 
1099 		strncpy(lp->d_packname, "default label",
1100 		    sizeof(lp->d_packname));
1101 	}
1102 }
1103 
1104 /*
1105  * Wait interruptibly for an exclusive lock.
1106  *
1107  * XXX
1108  * Several drivers do this; it should be abstracted and made MP-safe.
1109  */
1110 static int
1111 vndlock(sc)
1112 	struct vnd_softc *sc;
1113 {
1114 	int error;
1115 
1116 	while ((sc->sc_flags & VNF_LOCKED) != 0) {
1117 		sc->sc_flags |= VNF_WANTED;
1118 		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
1119 			return (error);
1120 	}
1121 	sc->sc_flags |= VNF_LOCKED;
1122 	return (0);
1123 }
1124 
1125 /*
1126  * Unlock and wake up any waiters.
1127  */
1128 static void
1129 vndunlock(sc)
1130 	struct vnd_softc *sc;
1131 {
1132 
1133 	sc->sc_flags &= ~VNF_LOCKED;
1134 	if ((sc->sc_flags & VNF_WANTED) != 0) {
1135 		sc->sc_flags &= ~VNF_WANTED;
1136 		wakeup(sc);
1137 	}
1138 }
1139