xref: /dflybsd-src/sys/dev/disk/ccd/ccd.c (revision d2d1103f52e6fb116ee65a9940477c5449933f28)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
35  */
36 /*
37  * Copyright (c) 1995 Jason R. Thorpe.
38  * All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed for the NetBSD Project
51  *	by Jason R. Thorpe.
52  * 4. The name of the author may not be used to endorse or promote products
53  *    derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
56  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
59  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
60  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
61  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
62  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
63  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  */
67 
68 /*
69  * Copyright (c) 1988 University of Utah.
70  * Copyright (c) 1990, 1993
71  *	The Regents of the University of California.  All rights reserved.
72  *
73  * This code is derived from software contributed to Berkeley by
74  * the Systems Programming Group of the University of Utah Computer
75  * Science Department.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  * 3. All advertising materials mentioning features or use of this software
86  *    must display the following acknowledgement:
87  *	This product includes software developed by the University of
88  *	California, Berkeley and its contributors.
89  * 4. Neither the name of the University nor the names of its contributors
90  *    may be used to endorse or promote products derived from this software
91  *    without specific prior written permission.
92  *
93  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
94  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
95  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
96  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
97  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
98  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
99  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
101  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
102  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
103  * SUCH DAMAGE.
104  *
105  * from: Utah $Hdr: cd.c 1.6 90/11/28$
106  */
107 /*
108  * @(#)cd.c	8.2 (Berkeley) 11/16/93
109  * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
110  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
111  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
112  */
113 
114 /*
115  * "Concatenated" disk driver.
116  *
117  * Original dynamic configuration support by:
118  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
119  *	Numerical Aerodynamic Simulation Facility
120  *	Mail Stop 258-6
121  *	NASA Ames Research Center
122  *	Moffett Field, CA 94035
123  */
124 
125 #include "use_ccd.h"
126 
127 #include <sys/param.h>
128 #include <sys/systm.h>
129 #include <sys/kernel.h>
130 #include <sys/module.h>
131 #include <sys/proc.h>
132 #include <sys/buf.h>
133 #include <sys/malloc.h>
134 #include <sys/nlookup.h>
135 #include <sys/conf.h>
136 #include <sys/stat.h>
137 #include <sys/sysctl.h>
138 #include <sys/disk.h>
139 #include <sys/dtype.h>
140 #include <sys/diskslice.h>
141 #include <sys/devicestat.h>
142 #include <sys/fcntl.h>
143 #include <sys/vnode.h>
144 #include <sys/ccdvar.h>
145 
146 #include <vm/vm_zone.h>
147 
148 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
149 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
150 
151 #include <sys/thread2.h>
152 #include <sys/buf2.h>
153 #include <sys/mplock2.h>
154 
155 #if defined(CCDDEBUG) && !defined(DEBUG)
156 #define DEBUG
157 #endif
158 
159 #ifdef DEBUG
160 #define CCDB_FOLLOW	0x01
161 #define CCDB_INIT	0x02
162 #define CCDB_IO		0x04
163 #define CCDB_LABEL	0x08
164 #define CCDB_VNODE	0x10
165 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
166     CCDB_VNODE;
167 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
168 #undef DEBUG
169 #endif
170 
171 #define	ccdunit(x)	dkunit(x)
172 #define ccdpart(x)	dkpart(x)
173 
174 /*
175    This is how mirroring works (only writes are special):
176 
177    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
178    linked together by the cb_mirror field.  "cb_pflags &
179    CCDPF_MIRROR_DONE" is set to 0 on both of them.
180 
181    When a component returns to ccdiodone(), it checks if "cb_pflags &
182    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
183    flag and returns.  If it is, it means its partner has already
184    returned, so it will go to the regular cleanup.
185 
186  */
187 
/*
 * Per-component I/O descriptor.  One of these is obtained from
 * getccdbuf() for every request sent to a component and handed back
 * through putccdbuf(); idle descriptors are chained on the ccdfreebufs
 * list via cb_freenext.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct vnode	*cb_vp;		/* related vnode */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
198 
199 /* bits in cb_pflags */
200 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
201 
202 static d_open_t ccdopen;
203 static d_close_t ccdclose;
204 static d_strategy_t ccdstrategy;
205 static d_ioctl_t ccdioctl;
206 static d_dump_t ccddump;
207 
208 #define NCCDFREEHIWAT	16
209 
210 #define CDEV_MAJOR 74
211 
/*
 * Device operations vector for ccd units.  ccdattach() passes this to
 * disk_create() for every unit.  Character-device reads and writes are
 * routed through the generic physread/physwrite helpers.
 */
static struct dev_ops ccd_ops = {
	{ "ccd", CDEV_MAJOR, D_DISK },
	.d_open =	ccdopen,
	.d_close =	ccdclose,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_ioctl =	ccdioctl,
	.d_strategy =	ccdstrategy,
	.d_dump =	ccddump
};
222 
223 /* called during module initialization */
224 static	void ccdattach (void);
225 static	int ccddetach (void);
226 static	int ccd_modevent (module_t, int, void *);
227 
228 /* called by biodone() at interrupt time */
229 static	void ccdiodone (struct bio *bio);
230 
231 static	void ccdstart (struct ccd_softc *, struct bio *);
232 static	void ccdinterleave (struct ccd_softc *, int);
233 static	void ccdintr (struct ccd_softc *, struct bio *);
234 static	int ccdinit (struct ccddevice *, char **, struct ucred *);
235 static	int ccdlookup (char *, struct vnode **);
236 static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
237 		struct bio *, off_t, caddr_t, long);
238 static	int ccdlock (struct ccd_softc *);
239 static	void ccdunlock (struct ccd_softc *);
240 
241 #ifdef DEBUG
242 static	void printiinfo (struct ccdiinfo *);
243 #endif
244 
245 /* Non-private for the benefit of libkvm. */
246 struct	ccd_softc *ccd_softc;
247 struct	ccddevice *ccddevs;
248 struct	ccdbuf *ccdfreebufs;
249 static	int numccdfreebufs;
250 static	int numccd = 0;
251 
252 /*
253  * getccdbuf() -	Allocate and zero a ccd buffer.
254  *
255  *	This routine is called at splbio().
256  */
257 
258 static __inline
259 struct ccdbuf *
260 getccdbuf(void)
261 {
262 	struct ccdbuf *cbp;
263 
264 	/*
265 	 * Allocate from freelist or malloc as necessary
266 	 */
267 	if ((cbp = ccdfreebufs) != NULL) {
268 		ccdfreebufs = cbp->cb_freenext;
269 		--numccdfreebufs;
270 		reinitbufbio(&cbp->cb_buf);
271 	} else {
272 		cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
273 		initbufbio(&cbp->cb_buf);
274 	}
275 
276 	/*
277 	 * independant struct buf initialization
278 	 */
279 	buf_dep_init(&cbp->cb_buf);
280 	BUF_LOCKINIT(&cbp->cb_buf);
281 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
282 	BUF_KERNPROC(&cbp->cb_buf);
283 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
284 
285 	return(cbp);
286 }
287 
288 /*
289  * putccdbuf() -	Free a ccd buffer.
290  *
291  *	This routine is called at splbio().
292  */
293 
294 static __inline
295 void
296 putccdbuf(struct ccdbuf *cbp)
297 {
298 	BUF_UNLOCK(&cbp->cb_buf);
299 	BUF_LOCKFREE(&cbp->cb_buf);
300 
301 	if (numccdfreebufs < NCCDFREEHIWAT) {
302 		cbp->cb_freenext = ccdfreebufs;
303 		ccdfreebufs = cbp;
304 		++numccdfreebufs;
305 	} else {
306 		kfree((caddr_t)cbp, M_DEVBUF);
307 	}
308 }
309 
310 /*
311  * Called by main() during pseudo-device attachment.  All we need
312  * to do is allocate enough space for devices to be configured later, and
313  * add devsw entries.
314  */
315 static void
316 ccdattach(void)
317 {
318 	struct disk_info info;
319 	struct ccd_softc *cs;
320 	int i;
321 	int num = NCCD;
322 
323 	if (num > 1)
324 		kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
325 	else
326 		kprintf("ccd0: Concatenated disk driver\n");
327 
328 	ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
329 			    M_WAITOK | M_ZERO);
330 	ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
331 			    M_WAITOK | M_ZERO);
332 	numccd = num;
333 
334 	/*
335 	 * With normal disk devices the open simply fails if the media
336 	 * is not present.  With CCD we have to be able to open the
337 	 * raw disk to use the ioctl's to set it up, so create a dummy
338 	 * disk info structure so dscheck() doesn't blow up.
339 	 */
340 	bzero(&info, sizeof(info));
341 	info.d_media_blksize = DEV_BSIZE;
342 
343 	for (i = 0; i < numccd; ++i) {
344 		cs = &ccd_softc[i];
345 		cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
346 		cs->sc_dev->si_drv1 = cs;
347 		cs->sc_dev->si_iosize_max = 256 * 512;	/* XXX */
348 		disk_setdiskinfo(&cs->sc_disk, &info);
349 	}
350 }
351 
352 static int
353 ccddetach(void)
354 {
355 	struct ccd_softc *cs;
356 	struct dev_ioctl_args ioctl_args;
357 	int i;
358 	int error = 0;
359 	int eval;
360 
361 	bzero(&ioctl_args, sizeof(ioctl_args));
362 
363 	for (i = 0; i < numccd; ++i) {
364 		cs = &ccd_softc[i];
365 		if (cs->sc_dev == NULL)
366 			continue;
367 		ioctl_args.a_head.a_dev = cs->sc_dev;
368 		ioctl_args.a_cmd = CCDIOCCLR;
369 		ioctl_args.a_fflag = FWRITE;
370 		eval = ccdioctl(&ioctl_args);
371 		if (eval && eval != ENXIO) {
372 			kprintf("ccd%d: In use, cannot detach\n", i);
373 			error = EBUSY;
374 		}
375 	}
376 	if (error == 0) {
377 		for (i = 0; i < numccd; ++i) {
378 			cs = &ccd_softc[i];
379 			if (cs->sc_dev == NULL)
380 				continue;
381 			disk_destroy(&cs->sc_disk);
382 			cs->sc_dev = NULL;
383 		}
384 		if (ccd_softc)
385 			kfree(ccd_softc, M_DEVBUF);
386 		if (ccddevs)
387 			kfree(ccddevs, M_DEVBUF);
388 	}
389 	return (error);
390 }
391 
392 static int
393 ccd_modevent(module_t mod, int type, void *data)
394 {
395 	int error = 0;
396 
397 	switch (type) {
398 	case MOD_LOAD:
399 		ccdattach();
400 		break;
401 
402 	case MOD_UNLOAD:
403 		error = ccddetach();
404 		break;
405 
406 	default:	/* MOD_SHUTDOWN etc */
407 		break;
408 	}
409 	return (error);
410 }
411 
412 DEV_MODULE(ccd, ccd_modevent, NULL);
413 
414 static int
415 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
416 {
417 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
418 	struct ccdcinfo *ci = NULL;	/* XXX */
419 	int ix;
420 	struct vnode *vp;
421 	u_int64_t skip;
422 	u_int64_t size;
423 	u_int64_t minsize;
424 	int maxsecsize;
425 	struct partinfo dpart;
426 	struct ccdgeom *ccg = &cs->sc_geom;
427 	char tmppath[MAXPATHLEN];
428 	int error = 0;
429 
430 #ifdef DEBUG
431 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
432 		kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
433 #endif
434 
435 	cs->sc_size = 0;
436 	cs->sc_ileave = ccd->ccd_interleave;
437 	cs->sc_nccdisks = ccd->ccd_ndev;
438 
439 	/* Allocate space for the component info. */
440 	cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
441 				M_DEVBUF, M_WAITOK);
442 	cs->sc_maxiosize = MAXPHYS;
443 
444 	/*
445 	 * Verify that each component piece exists and record
446 	 * relevant information about it.
447 	 */
448 	maxsecsize = 0;
449 	minsize = 0;
450 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
451 		vp = ccd->ccd_vpp[ix];
452 		ci = &cs->sc_cinfo[ix];
453 		ci->ci_vp = vp;
454 
455 		/*
456 		 * Copy in the pathname of the component.
457 		 */
458 		bzero(tmppath, sizeof(tmppath));	/* sanity */
459 		if ((error = copyinstr(cpaths[ix], tmppath,
460 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
461 #ifdef DEBUG
462 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
463 				kprintf("ccd%d: can't copy path, error = %d\n",
464 				    ccd->ccd_unit, error);
465 #endif
466 			goto fail;
467 		}
468 		ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
469 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
470 
471 		ci->ci_dev = vn_todev(vp);
472 		if (ci->ci_dev->si_iosize_max &&
473 		    cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
474 			cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
475 		}
476 
477 		/*
478 		 * Get partition information for the component.
479 		 */
480 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD,
481 				  cred, NULL);
482 		if (error) {
483 #ifdef DEBUG
484 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
485 				 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
486 				     ccd->ccd_unit, ci->ci_path, error);
487 #endif
488 			goto fail;
489 		}
490 		if (dpart.fstype != FS_CCD &&
491 		    !kuuid_is_ccd(&dpart.fstype_uuid)) {
492 			kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
493 				ccd->ccd_unit, ci->ci_path);
494 			error = EFTYPE;
495 			goto fail;
496 		}
497 		if (maxsecsize < dpart.media_blksize)
498 			maxsecsize = dpart.media_blksize;
499 
500 		/*
501 		 * Skip a certain amount of storage at the beginning of
502 		 * the component to make sure we don't infringe on any
503 		 * reserved sectors.  This is handled entirely by
504 		 * dpart.reserved_blocks but we also impose a minimum
505 		 * of 16 sectors for backwards compatibility.
506 		 */
507 		skip = 16;
508 		if (skip < dpart.reserved_blocks)
509 			skip = dpart.reserved_blocks;
510 		size = dpart.media_blocks - skip;
511 
512 		/*
513 		 * Calculate the size, truncating to an interleave
514 		 * boundary if necessary.
515 		 */
516 		if (cs->sc_ileave > 1)
517 			size -= size % cs->sc_ileave;
518 
519 		if ((int64_t)size <= 0) {
520 #ifdef DEBUG
521 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
522 				kprintf("ccd%d: %s: size == 0\n",
523 				    ccd->ccd_unit, ci->ci_path);
524 #endif
525 			error = ENODEV;
526 			goto fail;
527 		}
528 
529 		/*
530 		 * Calculate the smallest uniform component, used
531 		 * elsewhere.
532 		 */
533 		if (minsize == 0 || minsize > size)
534 			minsize = size;
535 		ci->ci_skip = skip;
536 		ci->ci_size = size;
537 		cs->sc_size += size;
538 	}
539 	kprintf("ccd%d: max component iosize is %d total blocks %lld\n",
540 		cs->sc_unit, cs->sc_maxiosize, (long long)cs->sc_size);
541 
542 	/*
543 	 * Don't allow the interleave to be smaller than
544 	 * the biggest component sector.
545 	 */
546 	if ((cs->sc_ileave > 0) &&
547 	    (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
548 #ifdef DEBUG
549 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
550 			kprintf("ccd%d: interleave must be at least %d\n",
551 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
552 #endif
553 		error = EINVAL;
554 		goto fail;
555 	}
556 
557 	/*
558 	 * If uniform interleave is desired set all sizes to that of
559 	 * the smallest component.  This will guarentee that a single
560 	 * interleave table is generated.
561 	 *
562 	 * Lost space must be taken into account when calculating the
563 	 * overall size.  Half the space is lost when CCDF_MIRROR is
564 	 * specified.  One disk is lost when CCDF_PARITY is specified.
565 	 */
566 	if (ccd->ccd_flags & CCDF_UNIFORM) {
567 		for (ci = cs->sc_cinfo;
568 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
569 			ci->ci_size = minsize;
570 		}
571 		if (ccd->ccd_flags & CCDF_MIRROR) {
572 			/*
573 			 * Check to see if an even number of components
574 			 * have been specified.  The interleave must also
575 			 * be non-zero in order for us to be able to
576 			 * guarentee the topology.
577 			 */
578 			if (cs->sc_nccdisks % 2) {
579 				kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
580 				error = EINVAL;
581 				goto fail;
582 			}
583 			if (cs->sc_ileave == 0) {
584 				kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
585 				error = EINVAL;
586 				goto fail;
587 			}
588 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
589 		} else if (ccd->ccd_flags & CCDF_PARITY) {
590 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
591 		} else {
592 			if (cs->sc_ileave == 0) {
593 				kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
594 				error = EINVAL;
595 				goto fail;
596 			}
597 			cs->sc_size = cs->sc_nccdisks * minsize;
598 		}
599 	}
600 
601 	/*
602 	 * Construct the interleave table.
603 	 */
604 	ccdinterleave(cs, ccd->ccd_unit);
605 
606 	/*
607 	 * Create pseudo-geometry based on 1MB cylinders.  It's
608 	 * pretty close.
609 	 */
610 	ccg->ccg_secsize = maxsecsize;
611 	ccg->ccg_ntracks = 1;
612 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
613 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
614 
615 	/*
616 	 * Add an devstat entry for this device.
617 	 */
618 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
619 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
620 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
621 			  DEVSTAT_PRIORITY_ARRAY);
622 
623 	cs->sc_flags |= CCDF_INITED;
624 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
625 	cs->sc_unit = ccd->ccd_unit;
626 	return (0);
627 fail:
628 	while (ci > cs->sc_cinfo) {
629 		ci--;
630 		kfree(ci->ci_path, M_DEVBUF);
631 	}
632 	kfree(cs->sc_cinfo, M_DEVBUF);
633 	cs->sc_cinfo = NULL;
634 	return (error);
635 }
636 
637 static void
638 ccdinterleave(struct ccd_softc *cs, int unit)
639 {
640 	struct ccdcinfo *ci, *smallci;
641 	struct ccdiinfo *ii;
642 	u_int64_t bn;
643 	u_int64_t lbn;
644 	u_int64_t size;
645 	int icount;
646 	int ix;
647 
648 #ifdef DEBUG
649 	if (ccddebug & CCDB_INIT)
650 		kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
651 #endif
652 
653 	/*
654 	 * Allocate an interleave table.  The worst case occurs when each
655 	 * of N disks is of a different size, resulting in N interleave
656 	 * tables.
657 	 *
658 	 * Chances are this is too big, but we don't care.
659 	 */
660 	icount = cs->sc_nccdisks + 1;
661 	cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
662 				M_DEVBUF, M_WAITOK|M_ZERO);
663 
664 	/*
665 	 * Trivial case: no interleave (actually interleave of disk size).
666 	 * Each table entry represents a single component in its entirety.
667 	 *
668 	 * An interleave of 0 may not be used with a mirror or parity setup.
669 	 */
670 	if (cs->sc_ileave == 0) {
671 		bn = 0;
672 		ii = cs->sc_itable;
673 
674 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
675 			/* Allocate space for ii_index. */
676 			ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
677 			ii->ii_ndisk = 1;
678 			ii->ii_startblk = bn;
679 			ii->ii_startoff = 0;
680 			ii->ii_index[0] = ix;
681 			bn += cs->sc_cinfo[ix].ci_size;
682 			ii++;
683 		}
684 		ii->ii_ndisk = 0;
685 #ifdef DEBUG
686 		if (ccddebug & CCDB_INIT)
687 			printiinfo(cs->sc_itable);
688 #endif
689 		return;
690 	}
691 
692 	/*
693 	 * The following isn't fast or pretty; it doesn't have to be.
694 	 */
695 	size = 0;
696 	bn = lbn = 0;
697 	for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
698 		/*
699 		 * Allocate space for ii_index.  We might allocate more then
700 		 * we use.
701 		 */
702 		ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
703 					M_DEVBUF, M_WAITOK);
704 
705 		/*
706 		 * Locate the smallest of the remaining components
707 		 */
708 		smallci = NULL;
709 		ci = cs->sc_cinfo;
710 		while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
711 			if (ci->ci_size > size &&
712 			    (smallci == NULL ||
713 			     ci->ci_size < smallci->ci_size)) {
714 				smallci = ci;
715 			}
716 			++ci;
717 		}
718 
719 		/*
720 		 * Nobody left, all done
721 		 */
722 		if (smallci == NULL) {
723 			ii->ii_ndisk = 0;
724 			break;
725 		}
726 
727 		/*
728 		 * Record starting logical block using an sc_ileave blocksize.
729 		 */
730 		ii->ii_startblk = bn / cs->sc_ileave;
731 
732 		/*
733 		 * Record starting component block using an sc_ileave
734 		 * blocksize.  This value is relative to the beginning of
735 		 * a component disk.
736 		 */
737 		ii->ii_startoff = lbn;
738 
739 		/*
740 		 * Determine how many disks take part in this interleave
741 		 * and record their indices.
742 		 */
743 		ix = 0;
744 		for (ci = cs->sc_cinfo;
745 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
746 			if (ci->ci_size >= smallci->ci_size) {
747 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
748 			}
749 		}
750 		ii->ii_ndisk = ix;
751 
752 		/*
753 		 * Adjust for loop
754 		 */
755 		bn += ix * (smallci->ci_size - size);
756 		lbn = smallci->ci_size / cs->sc_ileave;
757 		size = smallci->ci_size;
758 	}
759 	if (ii == &cs->sc_itable[icount])
760 		panic("ccdinterlave software bug!  table exhausted");
761 #ifdef DEBUG
762 	if (ccddebug & CCDB_INIT)
763 		printiinfo(cs->sc_itable);
764 #endif
765 }
766 
767 /* ARGSUSED */
768 static int
769 ccdopen(struct dev_open_args *ap)
770 {
771 	cdev_t dev = ap->a_head.a_dev;
772 	int unit = ccdunit(dev);
773 	struct ccd_softc *cs;
774 	int error = 0;
775 
776 #ifdef DEBUG
777 	if (ccddebug & CCDB_FOLLOW)
778 		kprintf("ccdopen(%x, %x)\n", dev, flags);
779 #endif
780 	if (unit >= numccd)
781 		return (ENXIO);
782 	cs = &ccd_softc[unit];
783 
784 	if ((error = ccdlock(cs)) == 0) {
785 		ccdunlock(cs);
786 	}
787 	return (error);
788 }
789 
790 /* ARGSUSED */
791 static int
792 ccdclose(struct dev_close_args *ap)
793 {
794 	cdev_t dev = ap->a_head.a_dev;
795 	int unit = ccdunit(dev);
796 	struct ccd_softc *cs;
797 	int error = 0;
798 
799 #ifdef DEBUG
800 	if (ccddebug & CCDB_FOLLOW)
801 		kprintf("ccdclose(%x, %x)\n", dev, flags);
802 #endif
803 
804 	if (unit >= numccd)
805 		return (ENXIO);
806 	cs = &ccd_softc[unit];
807 	if ((error = ccdlock(cs)) == 0) {
808 		ccdunlock(cs);
809 	}
810 	return (error);
811 }
812 
813 static int
814 ccdstrategy(struct dev_strategy_args *ap)
815 {
816 	cdev_t dev = ap->a_head.a_dev;
817 	struct bio *bio = ap->a_bio;
818 	int unit = ccdunit(dev);
819 	struct bio *nbio;
820 	struct buf *bp = bio->bio_buf;
821 	struct ccd_softc *cs = &ccd_softc[unit];
822 	u_int64_t pbn;	/* in sc_secsize chunks */
823 	u_int32_t sz;	/* in sc_secsize chunks */
824 
825 #ifdef DEBUG
826 	if (ccddebug & CCDB_FOLLOW)
827 		kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
828 #endif
829 	if ((cs->sc_flags & CCDF_INITED) == 0) {
830 		bp->b_error = ENXIO;
831 		goto error;
832 	}
833 
834 	/* If it's a nil transfer, wake up the top half now. */
835 	if (bp->b_bcount == 0) {
836 		bp->b_resid = 0;
837 		goto done;
838 	}
839 
840 	/*
841 	 * Do bounds checking and adjust transfer.  If there's an
842 	 * error, the bounds check will flag that for us.
843 	 */
844 
845 	pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
846 	sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
847 
848 	/*
849 	 * If out of bounds return an error.  If the request goes
850 	 * past EOF, clip the request as appropriate.  If exactly
851 	 * at EOF, return success (don't clip), but with 0 bytes
852 	 * of I/O.
853 	 *
854 	 * Mark EOF B_INVAL (just like bad), indicating that the
855 	 * contents of the buffer, if any, is invalid.
856 	 */
857 	if ((int64_t)pbn < 0)
858 		goto bad;
859 	if (pbn + sz > cs->sc_size) {
860 		if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
861 			goto bad;
862 		if (pbn == cs->sc_size) {
863 			bp->b_resid = bp->b_bcount;
864 			bp->b_flags |= B_INVAL;
865 			goto done;
866 		}
867 		sz = (long)(cs->sc_size - pbn);
868 		bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
869 	}
870 	nbio = bio;
871 
872 	bp->b_resid = bp->b_bcount;
873 	nbio->bio_driver_info = dev;
874 
875 	/*
876 	 * "Start" the unit.
877 	 */
878 	crit_enter();
879 	ccdstart(cs, nbio);
880 	crit_exit();
881 	return(0);
882 
883 	/*
884 	 * note: bio, not nbio, is valid at the done label.
885 	 */
886 bad:
887 	bp->b_error = EINVAL;
888 error:
889 	bp->b_resid = bp->b_bcount;
890 	bp->b_flags |= B_ERROR | B_INVAL;
891 done:
892 	biodone(bio);
893 	return(0);
894 }
895 
896 static void
897 ccdstart(struct ccd_softc *cs, struct bio *bio)
898 {
899 	long bcount, rcount;
900 	struct ccdbuf *cbp[4];
901 	struct buf *bp = bio->bio_buf;
902 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
903 	caddr_t addr;
904 	off_t doffset;
905 
906 #ifdef DEBUG
907 	if (ccddebug & CCDB_FOLLOW)
908 		kprintf("ccdstart(%x, %x)\n", cs, bp);
909 #endif
910 
911 	/* Record the transaction start  */
912 	devstat_start_transaction(&cs->device_stats);
913 
914 	/*
915 	 * Allocate component buffers and fire off the requests
916 	 */
917 	doffset = bio->bio_offset;
918 	addr = bp->b_data;
919 
920 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
921 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
922 		rcount = cbp[0]->cb_buf.b_bcount;
923 
924 		if (cs->sc_cflags & CCDF_MIRROR) {
925 			/*
926 			 * Mirroring.  Writes go to both disks, reads are
927 			 * taken from whichever disk seems most appropriate.
928 			 *
929 			 * We attempt to localize reads to the disk whos arm
930 			 * is nearest the read request.  We ignore seeks due
931 			 * to writes when making this determination and we
932 			 * also try to avoid hogging.
933 			 */
934 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
935 				vn_strategy(cbp[0]->cb_vp,
936 					    &cbp[0]->cb_buf.b_bio1);
937 				vn_strategy(cbp[1]->cb_vp,
938 					    &cbp[1]->cb_buf.b_bio1);
939 			} else {
940 				int pick = cs->sc_pick;
941 				daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
942 				if (doffset < cs->sc_blk[pick] - range ||
943 				    doffset > cs->sc_blk[pick] + range
944 				) {
945 					cs->sc_pick = pick = 1 - pick;
946 				}
947 				cs->sc_blk[pick] = doffset + rcount;
948 				vn_strategy(cbp[pick]->cb_vp,
949 					    &cbp[pick]->cb_buf.b_bio1);
950 			}
951 		} else {
952 			/*
953 			 * Not mirroring
954 			 */
955 			vn_strategy(cbp[0]->cb_vp,
956 				     &cbp[0]->cb_buf.b_bio1);
957 		}
958 		doffset += rcount;
959 		addr += rcount;
960 	}
961 }
962 
963 /*
964  * Build a component buffer header.
965  */
966 static void
967 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
968 	  off_t doffset, caddr_t addr, long bcount)
969 {
970 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
971 	struct ccdbuf *cbp;
972 	u_int64_t bn;
973 	u_int64_t cbn;
974 	u_int64_t cboff;
975 	off_t cbc;
976 
977 #ifdef DEBUG
978 	if (ccddebug & CCDB_IO)
979 		kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
980 		       cs, bp, bn, addr, bcount);
981 #endif
982 	/*
983 	 * Determine which component bn falls in.
984 	 */
985 	bn = doffset / cs->sc_geom.ccg_secsize;
986 	cbn = bn;
987 	cboff = 0;
988 
989 	if (cs->sc_ileave == 0) {
990 		/*
991 		 * Serially concatenated and neither a mirror nor a parity
992 		 * config.  This is a special case.
993 		 */
994 		daddr_t sblk;
995 
996 		sblk = 0;
997 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
998 			sblk += ci->ci_size;
999 		cbn -= sblk;
1000 	} else {
1001 		struct ccdiinfo *ii;
1002 		int ccdisk, off;
1003 
1004 		/*
1005 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
1006 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
1007 		 * to cbn.
1008 		 */
1009 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
1010 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
1011 
1012 		/*
1013 		 * Figure out which interleave table to use.
1014 		 */
1015 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
1016 			if (ii->ii_startblk > cbn)
1017 				break;
1018 		}
1019 		ii--;
1020 
1021 		/*
1022 		 * off is the logical superblock relative to the beginning
1023 		 * of this interleave block.
1024 		 */
1025 		off = cbn - ii->ii_startblk;
1026 
1027 		/*
1028 		 * We must calculate which disk component to use (ccdisk),
1029 		 * and recalculate cbn to be the superblock relative to
1030 		 * the beginning of the component.  This is typically done by
1031 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1032 		 * must typically be divided by the number of components in
1033 		 * this interleave array to be properly convert it from a
1034 		 * CCD-relative logical superblock number to a
1035 		 * component-relative superblock number.
1036 		 */
1037 		if (ii->ii_ndisk == 1) {
1038 			/*
1039 			 * When we have just one disk, it can't be a mirror
1040 			 * or a parity config.
1041 			 */
1042 			ccdisk = ii->ii_index[0];
1043 			cbn = ii->ii_startoff + off;
1044 		} else {
1045 			if (cs->sc_cflags & CCDF_MIRROR) {
1046 				/*
1047 				 * We have forced a uniform mapping, resulting
1048 				 * in a single interleave array.  We double
1049 				 * up on the first half of the available
1050 				 * components and our mirror is in the second
1051 				 * half.  This only works with a single
1052 				 * interleave array because doubling up
1053 				 * doubles the number of sectors, so there
1054 				 * cannot be another interleave array because
1055 				 * the next interleave array's calculations
1056 				 * would be off.
1057 				 */
1058 				int ndisk2 = ii->ii_ndisk / 2;
1059 				ccdisk = ii->ii_index[off % ndisk2];
1060 				cbn = ii->ii_startoff + off / ndisk2;
1061 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1062 			} else if (cs->sc_cflags & CCDF_PARITY) {
1063 				/*
1064 				 * XXX not implemented yet
1065 				 */
1066 				int ndisk2 = ii->ii_ndisk - 1;
1067 				ccdisk = ii->ii_index[off % ndisk2];
1068 				cbn = ii->ii_startoff + off / ndisk2;
1069 				if (cbn % ii->ii_ndisk <= ccdisk)
1070 					ccdisk++;
1071 			} else {
1072 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1073 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1074 			}
1075 		}
1076 
1077 		ci = &cs->sc_cinfo[ccdisk];
1078 
1079 		/*
1080 		 * Convert cbn from a superblock to a normal block so it
1081 		 * can be used to calculate (along with cboff) the normal
1082 		 * block index into this particular disk.
1083 		 */
1084 		cbn *= cs->sc_ileave;
1085 	}
1086 
1087 	/*
1088 	 * Fill in the component buf structure.
1089 	 *
1090 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1091 	 * will be truncated on device EOF so we use b_bufsize to detect
1092 	 * the case.
1093 	 */
1094 	cbp = getccdbuf();
1095 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1096 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1097 	cbp->cb_buf.b_data = addr;
1098 	cbp->cb_vp = ci->ci_vp;
1099 	if (cs->sc_ileave == 0)
1100 		cbc = dbtob((off_t)(ci->ci_size - cbn));
1101 	else
1102 		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1103 	if (cbc > cs->sc_maxiosize)
1104 		cbc = cs->sc_maxiosize;
1105 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1106  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1107 
1108 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1109 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1110 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1111 
1112 	/*
1113 	 * context for ccdiodone
1114 	 */
1115 	cbp->cb_obio = bio;
1116 	cbp->cb_unit = cs - ccd_softc;
1117 	cbp->cb_comp = ci - cs->sc_cinfo;
1118 
1119 #ifdef DEBUG
1120 	if (ccddebug & CCDB_IO)
1121 		kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1122 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1123 		       cbp->cb_buf.b_bio1.bio_offset,
1124 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1125 #endif
1126 	cb[0] = cbp;
1127 
1128 	/*
1129 	 * Note: both I/O's setup when reading from mirror, but only one
1130 	 * will be executed.
1131 	 */
1132 	if (cs->sc_cflags & CCDF_MIRROR) {
1133 		/* mirror, setup second I/O */
1134 		cbp = getccdbuf();
1135 
1136 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1137 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1138 		cbp->cb_buf.b_data = addr;
1139 		cbp->cb_vp = ci2->ci_vp;
1140 		if (cs->sc_ileave == 0)
1141 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1142 		else
1143 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1144 		if (cbc > cs->sc_maxiosize)
1145 			cbc = cs->sc_maxiosize;
1146 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1147 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1148 
1149 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1150 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1151 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1152 
1153 		/*
1154 		 * context for ccdiodone
1155 		 */
1156 		cbp->cb_obio = bio;
1157 		cbp->cb_unit = cs - ccd_softc;
1158 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1159 		cb[1] = cbp;
1160 		/* link together the ccdbuf's and clear "mirror done" flag */
1161 		cb[0]->cb_mirror = cb[1];
1162 		cb[1]->cb_mirror = cb[0];
1163 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1164 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1165 	}
1166 }
1167 
1168 static void
1169 ccdintr(struct ccd_softc *cs, struct bio *bio)
1170 {
1171 	struct buf *bp = bio->bio_buf;
1172 
1173 #ifdef DEBUG
1174 	if (ccddebug & CCDB_FOLLOW)
1175 		kprintf("ccdintr(%x, %x)\n", cs, bp);
1176 #endif
1177 	/*
1178 	 * Request is done for better or worse, wakeup the top half.
1179 	 */
1180 	if (bp->b_flags & B_ERROR)
1181 		bp->b_resid = bp->b_bcount;
1182 	devstat_end_transaction_buf(&cs->device_stats, bp);
1183 	biodone(bio);
1184 }
1185 
1186 /*
1187  * Called at interrupt time.
1188  *
1189  * Mark the component as done and if all components are done,
1190  * take a ccd interrupt.
1191  */
1192 static void
1193 ccdiodone(struct bio *bio)
1194 {
1195 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1196 	struct bio *obio = cbp->cb_obio;
1197 	struct buf *obp = obio->bio_buf;
1198 	int unit = cbp->cb_unit;
1199 	int count;
1200 
1201 	/*
1202 	 * Since we do not have exclusive access to underlying devices,
1203 	 * we can't keep cache translations around.
1204 	 */
1205 	clearbiocache(bio->bio_next);
1206 
1207 	get_mplock();
1208 	crit_enter();
1209 #ifdef DEBUG
1210 	if (ccddebug & CCDB_FOLLOW)
1211 		kprintf("ccdiodone(%x)\n", cbp);
1212 	if (ccddebug & CCDB_IO) {
1213 		kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1214 		       obp, obp->b_bcount, obp->b_resid);
1215 		kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1216 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1217 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1218 		       cbp->cb_buf.b_bcount);
1219 	}
1220 #endif
1221 
1222 	/*
1223 	 * If an error occured, report it.  If this is a mirrored
1224 	 * configuration and the first of two possible reads, do not
1225 	 * set the error in the bp yet because the second read may
1226 	 * succeed.
1227 	 */
1228 	if (cbp->cb_buf.b_flags & B_ERROR) {
1229 		const char *msg = "";
1230 
1231 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1232 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1233 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1234 			/*
1235 			 * We will try our read on the other disk down
1236 			 * below, also reverse the default pick so if we
1237 			 * are doing a scan we do not keep hitting the
1238 			 * bad disk first.
1239 			 */
1240 			struct ccd_softc *cs = &ccd_softc[unit];
1241 
1242 			msg = ", trying other disk";
1243 			cs->sc_pick = 1 - cs->sc_pick;
1244 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1245 		} else {
1246 			obp->b_flags |= B_ERROR;
1247 			obp->b_error = cbp->cb_buf.b_error ?
1248 			    cbp->cb_buf.b_error : EIO;
1249 		}
1250 		kprintf("ccd%d: error %d on component %d "
1251 			"offset %jd (ccd offset %jd)%s\n",
1252 		        unit, obp->b_error, cbp->cb_comp,
1253 		        (intmax_t)cbp->cb_buf.b_bio2.bio_offset,
1254 		        (intmax_t)obio->bio_offset,
1255 		        msg);
1256 	}
1257 
1258 	/*
1259 	 * Process mirror.  If we are writing, I/O has been initiated on both
1260 	 * buffers and we fall through only after both are finished.
1261 	 *
1262 	 * If we are reading only one I/O is initiated at a time.  If an
1263 	 * error occurs we initiate the second I/O and return, otherwise
1264 	 * we free the second I/O without initiating it.
1265 	 */
1266 
1267 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1268 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1269 			/*
1270 			 * When writing, handshake with the second buffer
1271 			 * to determine when both are done.  If both are not
1272 			 * done, return here.
1273 			 */
1274 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1275 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1276 				putccdbuf(cbp);
1277 				crit_exit();
1278 				rel_mplock();
1279 				return;
1280 			}
1281 		} else {
1282 			/*
1283 			 * When reading, either dispose of the second buffer
1284 			 * or initiate I/O on the second buffer if an error
1285 			 * occured with this one.
1286 			 */
1287 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1288 				if (cbp->cb_buf.b_flags & B_ERROR) {
1289 					cbp->cb_mirror->cb_pflags |=
1290 					    CCDPF_MIRROR_DONE;
1291 					vn_strategy(
1292 					    cbp->cb_mirror->cb_vp,
1293 					    &cbp->cb_mirror->cb_buf.b_bio1
1294 					);
1295 					putccdbuf(cbp);
1296 					crit_exit();
1297 					rel_mplock();
1298 					return;
1299 				} else {
1300 					putccdbuf(cbp->cb_mirror);
1301 					/* fall through */
1302 				}
1303 			}
1304 		}
1305 	}
1306 
1307 	/*
1308 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1309 	 */
1310 	count = cbp->cb_buf.b_bufsize;
1311 	putccdbuf(cbp);
1312 
1313 	/*
1314 	 * If all done, "interrupt".
1315 	 */
1316 	obp->b_resid -= count;
1317 	if (obp->b_resid < 0)
1318 		panic("ccdiodone: count");
1319 	if (obp->b_resid == 0)
1320 		ccdintr(&ccd_softc[unit], obio);
1321 	crit_exit();
1322 	rel_mplock();
1323 }
1324 
1325 static int
1326 ccdioctl(struct dev_ioctl_args *ap)
1327 {
1328 	cdev_t dev = ap->a_head.a_dev;
1329 	int unit = ccdunit(dev);
1330 	int i, j, lookedup = 0, error = 0;
1331 	struct ccd_softc *cs;
1332 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1333 	struct ccddevice ccd;
1334 	struct disk_info info;
1335 	char **cpp;
1336 	struct vnode **vpp;
1337 
1338 	if (unit >= numccd)
1339 		return (ENXIO);
1340 	cs = &ccd_softc[unit];
1341 
1342 	bzero(&ccd, sizeof(ccd));
1343 
1344 	switch (ap->a_cmd) {
1345 	case CCDIOCSET:
1346 		if (cs->sc_flags & CCDF_INITED)
1347 			return (EBUSY);
1348 
1349 		if ((ap->a_fflag & FWRITE) == 0)
1350 			return (EBADF);
1351 
1352 		if ((error = ccdlock(cs)) != 0)
1353 			return (error);
1354 
1355 		if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1356 			ccdunlock(cs);
1357 			return (EINVAL);
1358 		}
1359 
1360 		/* Fill in some important bits. */
1361 		ccd.ccd_unit = unit;
1362 		ccd.ccd_interleave = ccio->ccio_ileave;
1363 		if (ccd.ccd_interleave == 0 &&
1364 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1365 		     (ccio->ccio_flags & CCDF_PARITY))) {
1366 			kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1367 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1368 		}
1369 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1370 		    (ccio->ccio_flags & CCDF_PARITY)) {
1371 			kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1372 			ccio->ccio_flags &= ~CCDF_PARITY;
1373 		}
1374 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1375 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1376 			kprintf("ccd%d: mirror/parity forces uniform flag\n",
1377 			       unit);
1378 			ccio->ccio_flags |= CCDF_UNIFORM;
1379 		}
1380 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1381 
1382 		/*
1383 		 * Allocate space for and copy in the array of
1384 		 * componet pathnames and device numbers.
1385 		 */
1386 		cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1387 		    M_DEVBUF, M_WAITOK);
1388 		vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1389 		    M_DEVBUF, M_WAITOK);
1390 
1391 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1392 				ccio->ccio_ndisks * sizeof(char **));
1393 		if (error) {
1394 			kfree(vpp, M_DEVBUF);
1395 			kfree(cpp, M_DEVBUF);
1396 			ccdunlock(cs);
1397 			return (error);
1398 		}
1399 
1400 #ifdef DEBUG
1401 		if (ccddebug & CCDB_INIT) {
1402 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1403 				kprintf("ccdioctl: component %d: 0x%x\n",
1404 				    i, cpp[i]);
1405 		}
1406 #endif
1407 
1408 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1409 #ifdef DEBUG
1410 			if (ccddebug & CCDB_INIT)
1411 				kprintf("ccdioctl: lookedup = %d\n", lookedup);
1412 #endif
1413 			if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1414 				for (j = 0; j < lookedup; ++j)
1415 					(void)vn_close(vpp[j], FREAD|FWRITE);
1416 				kfree(vpp, M_DEVBUF);
1417 				kfree(cpp, M_DEVBUF);
1418 				ccdunlock(cs);
1419 				return (error);
1420 			}
1421 			++lookedup;
1422 		}
1423 		ccd.ccd_cpp = cpp;
1424 		ccd.ccd_vpp = vpp;
1425 		ccd.ccd_ndev = ccio->ccio_ndisks;
1426 
1427 		/*
1428 		 * Initialize the ccd.  Fills in the softc for us.
1429 		 */
1430 		if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1431 			for (j = 0; j < lookedup; ++j)
1432 				(void)vn_close(vpp[j], FREAD|FWRITE);
1433 			kfree(vpp, M_DEVBUF);
1434 			kfree(cpp, M_DEVBUF);
1435 			ccdunlock(cs);
1436 			return (error);
1437 		}
1438 
1439 		/*
1440 		 * The ccd has been successfully initialized, so
1441 		 * we can place it into the array and read the disklabel.
1442 		 */
1443 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1444 		ccio->ccio_unit = unit;
1445 		ccio->ccio_size = cs->sc_size;
1446 
1447 		bzero(&info, sizeof(info));
1448 		info.d_media_blksize = cs->sc_geom.ccg_secsize;
1449 		info.d_media_blocks  = cs->sc_size;
1450 		info.d_nheads	     = cs->sc_geom.ccg_ntracks;
1451 		info.d_secpertrack   = cs->sc_geom.ccg_nsectors;
1452 		info.d_ncylinders    = cs->sc_geom.ccg_ncylinders;
1453 		info.d_secpercyl     = info.d_nheads * info.d_secpertrack;
1454 
1455 		/*
1456 		 * For cases where a label is directly applied to the ccd,
1457 		 * without slices, DSO_COMPATMBR forces one sector be
1458 		 * reserved for backwards compatibility.
1459 		 */
1460 		info.d_dsflags	     = DSO_COMPATMBR;
1461 		disk_setdiskinfo(&cs->sc_disk, &info);
1462 
1463 		ccdunlock(cs);
1464 
1465 		break;
1466 
1467 	case CCDIOCCLR:
1468 		if ((cs->sc_flags & CCDF_INITED) == 0)
1469 			return (ENXIO);
1470 
1471 		if ((ap->a_fflag & FWRITE) == 0)
1472 			return (EBADF);
1473 
1474 		if ((error = ccdlock(cs)) != 0)
1475 			return (error);
1476 
1477 		if (dev_drefs(cs->sc_dev) > 1) {
1478 			ccdunlock(cs);
1479 			return (EBUSY);
1480 		}
1481 
1482 		/*
1483 		 * Free ccd_softc information and clear entry.
1484 		 */
1485 
1486 		/* Close the components and free their pathnames. */
1487 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1488 			/*
1489 			 * XXX: this close could potentially fail and
1490 			 * cause Bad Things.  Maybe we need to force
1491 			 * the close to happen?
1492 			 */
1493 #ifdef DEBUG
1494 			if (ccddebug & CCDB_VNODE)
1495 				vprint("CCDIOCCLR: vnode info",
1496 				    cs->sc_cinfo[i].ci_vp);
1497 #endif
1498 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1499 			kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1500 		}
1501 
1502 		/* Free interleave index. */
1503 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1504 			kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1505 
1506 		/* Free component info and interleave table. */
1507 		kfree(cs->sc_cinfo, M_DEVBUF);
1508 		kfree(cs->sc_itable, M_DEVBUF);
1509 		cs->sc_cinfo = NULL;
1510 		cs->sc_itable = NULL;
1511 		cs->sc_flags &= ~CCDF_INITED;
1512 
1513 		/*
1514 		 * Free ccddevice information and clear entry.
1515 		 */
1516 		kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1517 		kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1518 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1519 
1520 		/*
1521 		 * And remove the devstat entry.
1522 		 */
1523 		devstat_remove_entry(&cs->device_stats);
1524 
1525 		/* This must be atomic. */
1526 		crit_enter();
1527 		ccdunlock(cs);
1528 		crit_exit();
1529 
1530 		break;
1531 
1532 	default:
1533 		return (ENOTTY);
1534 	}
1535 
1536 	return (0);
1537 }
1538 
1539 static int
1540 ccddump(struct dev_dump_args *ap)
1541 {
1542 	/* Not implemented. */
1543 	return ENXIO;
1544 }
1545 
1546 /*
1547  * Lookup the provided name in the filesystem.  If the file exists,
1548  * is a valid block device, and isn't being used by anyone else,
1549  * set *vpp to the file's vnode.
1550  */
1551 static int
1552 ccdlookup(char *path, struct vnode **vpp)
1553 {
1554 	struct nlookupdata nd;
1555 	struct vnode *vp;
1556 	int error;
1557 
1558 	*vpp = NULL;
1559 
1560 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1561 	if (error)
1562 		return (error);
1563 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1564 #ifdef DEBUG
1565 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1566 			kprintf("ccdlookup: vn_open error = %d\n", error);
1567 #endif
1568 		goto done;
1569 	}
1570 	vp = nd.nl_open_vp;
1571 
1572 	if (vp->v_opencount > 1) {
1573 		error = EBUSY;
1574 		goto done;
1575 	}
1576 
1577 	if (!vn_isdisk(vp, &error))
1578 		goto done;
1579 
1580 #ifdef DEBUG
1581 	if (ccddebug & CCDB_VNODE)
1582 		vprint("ccdlookup: vnode info", vp);
1583 #endif
1584 
1585 	vn_unlock(vp);
1586 	nd.nl_open_vp = NULL;
1587 	nlookup_done(&nd);
1588 	*vpp = vp;				/* leave ref intact  */
1589 	return (0);
1590 done:
1591 	nlookup_done(&nd);
1592 	return (error);
1593 }
1594 
1595 /*
1596  * Wait interruptibly for an exclusive lock.
1597  *
1598  * XXX
1599  * Several drivers do this; it should be abstracted and made MP-safe.
1600  */
1601 static int
1602 ccdlock(struct ccd_softc *cs)
1603 {
1604 	int error;
1605 
1606 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1607 		cs->sc_flags |= CCDF_WANTED;
1608 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1609 			return (error);
1610 	}
1611 	cs->sc_flags |= CCDF_LOCKED;
1612 	return (0);
1613 }
1614 
1615 /*
1616  * Unlock and wake up any waiters.
1617  */
1618 static void
1619 ccdunlock(struct ccd_softc *cs)
1620 {
1621 
1622 	cs->sc_flags &= ~CCDF_LOCKED;
1623 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1624 		cs->sc_flags &= ~CCDF_WANTED;
1625 		wakeup(cs);
1626 	}
1627 }
1628 
#ifdef DEBUG
/*
 * Debug helper: print every entry of an interleave table, one line per
 * entry, stopping at the first entry whose ii_ndisk is zero.  Each line
 * shows the entry number, its disk count, start block, start offset and
 * the list of component indices.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int slot = 0;
	int d;

	while (ii->ii_ndisk) {
		kprintf(" itab[%d]: #dk %d sblk %d soff %d",
		    slot, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		d = 0;
		while (d < ii->ii_ndisk) {
			kprintf(" %d", ii->ii_index[d]);
			d++;
		}
		kprintf("\n");
		slot++;
		ii++;
	}
}
#endif
1644 
1645 
1646 /* Local Variables: */
1647 /* c-argdecl-indent: 8 */
1648 /* c-continued-statement-offset: 8 */
1649 /* c-indent-level: 8 */
1650 /* End: */
1651