xref: /dflybsd-src/sys/dev/disk/ccd/ccd.c (revision 330d3c4b487f3fc5d0eb023645b0b2a569f7048e)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
35  */
36 /*
37  * Copyright (c) 1995 Jason R. Thorpe.
38  * All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed for the NetBSD Project
51  *	by Jason R. Thorpe.
52  * 4. The name of the author may not be used to endorse or promote products
53  *    derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
56  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
59  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
60  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
61  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
62  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
63  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  */
67 
68 /*
69  * Copyright (c) 1988 University of Utah.
70  * Copyright (c) 1990, 1993
71  *	The Regents of the University of California.  All rights reserved.
72  *
73  * This code is derived from software contributed to Berkeley by
74  * the Systems Programming Group of the University of Utah Computer
75  * Science Department.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  * 3. All advertising materials mentioning features or use of this software
86  *    must display the following acknowledgement:
87  *	This product includes software developed by the University of
88  *	California, Berkeley and its contributors.
89  * 4. Neither the name of the University nor the names of its contributors
90  *    may be used to endorse or promote products derived from this software
91  *    without specific prior written permission.
92  *
93  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
94  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
95  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
96  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
97  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
98  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
99  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
101  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
102  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
103  * SUCH DAMAGE.
104  *
105  * from: Utah $Hdr: cd.c 1.6 90/11/28$
106  */
107 /*
108  * @(#)cd.c	8.2 (Berkeley) 11/16/93
109  * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
110  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
111  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
112  */
113 
114 /*
115  * "Concatenated" disk driver.
116  *
117  * Original dynamic configuration support by:
118  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
119  *	Numerical Aerodynamic Simulation Facility
120  *	Mail Stop 258-6
121  *	NASA Ames Research Center
122  *	Moffett Field, CA 94035
123  */
124 
125 #include "use_ccd.h"
126 
127 #include <sys/param.h>
128 #include <sys/systm.h>
129 #include <sys/kernel.h>
130 #include <sys/module.h>
131 #include <sys/proc.h>
132 #include <sys/buf.h>
133 #include <sys/malloc.h>
134 #include <sys/nlookup.h>
135 #include <sys/conf.h>
136 #include <sys/stat.h>
137 #include <sys/sysctl.h>
138 #include <sys/disk.h>
139 #include <sys/dtype.h>
140 #include <sys/diskslice.h>
141 #include <sys/devicestat.h>
142 #include <sys/fcntl.h>
143 #include <sys/vnode.h>
144 #include <sys/ccdvar.h>
145 
146 #include <vm/vm_zone.h>
147 
148 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
149 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
150 
151 #include <sys/thread2.h>
152 #include <sys/buf2.h>
153 #include <sys/mplock2.h>
154 
/*
 * Building with CCDDEBUG turns on the generic DEBUG switch for the
 * declarations that immediately follow.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/*
 * Bits tested against ccddebug to decide which classes of debug
 * messages are printed.  ccddebug is exported read/write as the
 * debug.ccddebug sysctl.
 */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * NOTE: DEBUG is switched off again right here, so unless something
 * re-defines it the "#ifdef DEBUG" sections further down in this file
 * are compiled out even when CCDDEBUG is set.
 */
#undef DEBUG
#endif

/* Unit / partition number of a ccd device, obtained via dkunit()/dkpart(). */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
173 
174 /*
175    This is how mirroring works (only writes are special):
176 
177    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
178    linked together by the cb_mirror field.  "cb_pflags &
179    CCDPF_MIRROR_DONE" is set to 0 on both of them.
180 
181    When a component returns to ccdiodone(), it checks if "cb_pflags &
182    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
183    flag and returns.  If it is, it means its partner has already
184    returned, so it will go to the regular cleanup.
185 
186  */
187 
/*
 * Per-component I/O descriptor.  ccdbuffer() fills these in for
 * ccdstart(), which issues cb_buf.b_bio1 to cb_vp through vn_strategy().
 * Idle descriptors are cached on the ccdfreebufs list, chained through
 * cb_freenext, by getccdbuf() and putccdbuf().
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct vnode	*cb_vp;		/* related vnode */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
201 
/* Device entry points implemented by this driver (wired up in ccd_ops). */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;

/* Maximum number of idle ccdbufs putccdbuf() keeps on the free list. */
#define NCCDFREEHIWAT	16

/* Major number placed in the ccd_ops header below. */
#define CDEV_MAJOR 74

/*
 * Device operations vector handed to disk_create() for every unit.
 * Reads and writes go through physread/physwrite; everything else is
 * served by the ccd* entry points declared above.
 */
static struct dev_ops ccd_ops = {
	{ "ccd", CDEV_MAJOR, D_DISK },
	.d_open =	ccdopen,
	.d_close =	ccdclose,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_ioctl =	ccdioctl,
	.d_strategy =	ccdstrategy,
	.d_dump =	ccddump
};
222 
/* called during module initialization */
static	void ccdattach (void);
static	int ccddetach (void);
static	int ccd_modevent (module_t, int, void *);

/* called by biodone() at interrupt time */
static	void ccdiodone (struct bio *bio);

/* internal helpers: I/O dispatch, configuration and unit locking */
static	void ccdstart (struct ccd_softc *, struct bio *);
static	void ccdinterleave (struct ccd_softc *, int);
static	void ccdintr (struct ccd_softc *, struct bio *);
static	int ccdinit (struct ccddevice *, char **, struct ucred *);
static	int ccdlookup (char *, struct vnode **);
static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
		struct bio *, off_t, caddr_t, long);
static	int ccdlock (struct ccd_softc *);
static	void ccdunlock (struct ccd_softc *);

#ifdef DEBUG
static	void printiinfo (struct ccdiinfo *);
#endif

/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;	/* per-unit state, numccd entries (ccdattach) */
struct	ccddevice *ccddevs;	/* per-unit configuration, numccd entries */
struct	ccdbuf *ccdfreebufs;	/* head of the idle ccdbuf free list */
static	int numccdfreebufs;	/* number of ccdbufs on ccdfreebufs */
static	int numccd = 0;		/* number of units set up by ccdattach() */
251 
252 /*
253  * getccdbuf() -	Allocate and zero a ccd buffer.
254  *
255  *	This routine is called at splbio().
256  */
257 
258 static __inline
259 struct ccdbuf *
260 getccdbuf(void)
261 {
262 	struct ccdbuf *cbp;
263 
264 	/*
265 	 * Allocate from freelist or malloc as necessary
266 	 */
267 	if ((cbp = ccdfreebufs) != NULL) {
268 		ccdfreebufs = cbp->cb_freenext;
269 		--numccdfreebufs;
270 		reinitbufbio(&cbp->cb_buf);
271 	} else {
272 		cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
273 		initbufbio(&cbp->cb_buf);
274 	}
275 
276 	/*
277 	 * independant struct buf initialization
278 	 */
279 	buf_dep_init(&cbp->cb_buf);
280 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
281 	BUF_KERNPROC(&cbp->cb_buf);
282 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
283 
284 	return(cbp);
285 }
286 
287 /*
288  * putccdbuf() -	Free a ccd buffer.
289  *
290  *	This routine is called at splbio().
291  */
292 
293 static __inline
294 void
295 putccdbuf(struct ccdbuf *cbp)
296 {
297 	BUF_UNLOCK(&cbp->cb_buf);
298 
299 	if (numccdfreebufs < NCCDFREEHIWAT) {
300 		cbp->cb_freenext = ccdfreebufs;
301 		ccdfreebufs = cbp;
302 		++numccdfreebufs;
303 	} else {
304 		uninitbufbio(&cbp->cb_buf);
305 		kfree((caddr_t)cbp, M_DEVBUF);
306 	}
307 }
308 
309 /*
310  * Called by main() during pseudo-device attachment.  All we need
311  * to do is allocate enough space for devices to be configured later, and
312  * add devsw entries.
313  */
314 static void
315 ccdattach(void)
316 {
317 	struct disk_info info;
318 	struct ccd_softc *cs;
319 	int i;
320 	int num = NCCD;
321 
322 	if (num > 1)
323 		kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
324 	else
325 		kprintf("ccd0: Concatenated disk driver\n");
326 
327 	ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
328 			    M_WAITOK | M_ZERO);
329 	ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
330 			    M_WAITOK | M_ZERO);
331 	numccd = num;
332 
333 	/*
334 	 * With normal disk devices the open simply fails if the media
335 	 * is not present.  With CCD we have to be able to open the
336 	 * raw disk to use the ioctl's to set it up, so create a dummy
337 	 * disk info structure so dscheck() doesn't blow up.
338 	 */
339 	bzero(&info, sizeof(info));
340 	info.d_media_blksize = DEV_BSIZE;
341 
342 	for (i = 0; i < numccd; ++i) {
343 		cs = &ccd_softc[i];
344 		cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
345 		cs->sc_dev->si_drv1 = cs;
346 		cs->sc_dev->si_iosize_max = 256 * 512;	/* XXX */
347 		disk_setdiskinfo(&cs->sc_disk, &info);
348 	}
349 }
350 
351 static int
352 ccddetach(void)
353 {
354 	struct ccd_softc *cs;
355 	struct dev_ioctl_args ioctl_args;
356 	int i;
357 	int error = 0;
358 	int eval;
359 
360 	bzero(&ioctl_args, sizeof(ioctl_args));
361 
362 	for (i = 0; i < numccd; ++i) {
363 		cs = &ccd_softc[i];
364 		if (cs->sc_dev == NULL)
365 			continue;
366 		ioctl_args.a_head.a_dev = cs->sc_dev;
367 		ioctl_args.a_cmd = CCDIOCCLR;
368 		ioctl_args.a_fflag = FWRITE;
369 		eval = ccdioctl(&ioctl_args);
370 		if (eval && eval != ENXIO) {
371 			kprintf("ccd%d: In use, cannot detach\n", i);
372 			error = EBUSY;
373 		}
374 	}
375 	if (error == 0) {
376 		for (i = 0; i < numccd; ++i) {
377 			cs = &ccd_softc[i];
378 			if (cs->sc_dev == NULL)
379 				continue;
380 			disk_destroy(&cs->sc_disk);
381 			cs->sc_dev = NULL;
382 		}
383 		if (ccd_softc)
384 			kfree(ccd_softc, M_DEVBUF);
385 		if (ccddevs)
386 			kfree(ccddevs, M_DEVBUF);
387 	}
388 	return (error);
389 }
390 
391 static int
392 ccd_modevent(module_t mod, int type, void *data)
393 {
394 	int error = 0;
395 
396 	switch (type) {
397 	case MOD_LOAD:
398 		ccdattach();
399 		break;
400 
401 	case MOD_UNLOAD:
402 		error = ccddetach();
403 		break;
404 
405 	default:	/* MOD_SHUTDOWN etc */
406 		break;
407 	}
408 	return (error);
409 }
410 
411 DEV_MODULE(ccd, ccd_modevent, NULL);
412 
413 static int
414 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
415 {
416 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
417 	struct ccdcinfo *ci = NULL;	/* XXX */
418 	int ix;
419 	struct vnode *vp;
420 	u_int64_t skip;
421 	u_int64_t size;
422 	u_int64_t minsize;
423 	int maxsecsize;
424 	struct partinfo dpart;
425 	struct ccdgeom *ccg = &cs->sc_geom;
426 	char tmppath[MAXPATHLEN];
427 	int error = 0;
428 
429 #ifdef DEBUG
430 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 		kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
432 #endif
433 
434 	cs->sc_size = 0;
435 	cs->sc_ileave = ccd->ccd_interleave;
436 	cs->sc_nccdisks = ccd->ccd_ndev;
437 
438 	/* Allocate space for the component info. */
439 	cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
440 				M_DEVBUF, M_WAITOK);
441 	cs->sc_maxiosize = MAXPHYS;
442 
443 	/*
444 	 * Verify that each component piece exists and record
445 	 * relevant information about it.
446 	 */
447 	maxsecsize = 0;
448 	minsize = 0;
449 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
450 		vp = ccd->ccd_vpp[ix];
451 		ci = &cs->sc_cinfo[ix];
452 		ci->ci_vp = vp;
453 
454 		/*
455 		 * Copy in the pathname of the component.
456 		 */
457 		bzero(tmppath, sizeof(tmppath));	/* sanity */
458 		if ((error = copyinstr(cpaths[ix], tmppath,
459 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
460 #ifdef DEBUG
461 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
462 				kprintf("ccd%d: can't copy path, error = %d\n",
463 				    ccd->ccd_unit, error);
464 #endif
465 			goto fail;
466 		}
467 		ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
468 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
469 
470 		ci->ci_dev = vn_todev(vp);
471 		if (ci->ci_dev->si_iosize_max &&
472 		    cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
473 			cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
474 		}
475 
476 		/*
477 		 * Get partition information for the component.
478 		 */
479 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD,
480 				  cred, NULL);
481 		if (error) {
482 #ifdef DEBUG
483 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
484 				 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
485 				     ccd->ccd_unit, ci->ci_path, error);
486 #endif
487 			goto fail;
488 		}
489 		if (dpart.fstype != FS_CCD &&
490 		    !kuuid_is_ccd(&dpart.fstype_uuid)) {
491 			kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
492 				ccd->ccd_unit, ci->ci_path);
493 			error = EFTYPE;
494 			goto fail;
495 		}
496 		if (maxsecsize < dpart.media_blksize)
497 			maxsecsize = dpart.media_blksize;
498 
499 		/*
500 		 * Skip a certain amount of storage at the beginning of
501 		 * the component to make sure we don't infringe on any
502 		 * reserved sectors.  This is handled entirely by
503 		 * dpart.reserved_blocks but we also impose a minimum
504 		 * of 16 sectors for backwards compatibility.
505 		 */
506 		skip = 16;
507 		if (skip < dpart.reserved_blocks)
508 			skip = dpart.reserved_blocks;
509 		size = dpart.media_blocks - skip;
510 
511 		/*
512 		 * Calculate the size, truncating to an interleave
513 		 * boundary if necessary.
514 		 */
515 		if (cs->sc_ileave > 1)
516 			size -= size % cs->sc_ileave;
517 
518 		if ((int64_t)size <= 0) {
519 #ifdef DEBUG
520 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
521 				kprintf("ccd%d: %s: size == 0\n",
522 				    ccd->ccd_unit, ci->ci_path);
523 #endif
524 			error = ENODEV;
525 			goto fail;
526 		}
527 
528 		/*
529 		 * Calculate the smallest uniform component, used
530 		 * elsewhere.
531 		 */
532 		if (minsize == 0 || minsize > size)
533 			minsize = size;
534 		ci->ci_skip = skip;
535 		ci->ci_size = size;
536 		cs->sc_size += size;
537 	}
538 	kprintf("ccd%d: max component iosize is %d total blocks %lld\n",
539 		cs->sc_unit, cs->sc_maxiosize, (long long)cs->sc_size);
540 
541 	/*
542 	 * Don't allow the interleave to be smaller than
543 	 * the biggest component sector.
544 	 */
545 	if ((cs->sc_ileave > 0) &&
546 	    (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
547 #ifdef DEBUG
548 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
549 			kprintf("ccd%d: interleave must be at least %d\n",
550 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
551 #endif
552 		error = EINVAL;
553 		goto fail;
554 	}
555 
556 	/*
557 	 * If uniform interleave is desired set all sizes to that of
558 	 * the smallest component.  This will guarentee that a single
559 	 * interleave table is generated.
560 	 *
561 	 * Lost space must be taken into account when calculating the
562 	 * overall size.  Half the space is lost when CCDF_MIRROR is
563 	 * specified.  One disk is lost when CCDF_PARITY is specified.
564 	 */
565 	if (ccd->ccd_flags & CCDF_UNIFORM) {
566 		for (ci = cs->sc_cinfo;
567 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
568 			ci->ci_size = minsize;
569 		}
570 		if (ccd->ccd_flags & CCDF_MIRROR) {
571 			/*
572 			 * Check to see if an even number of components
573 			 * have been specified.  The interleave must also
574 			 * be non-zero in order for us to be able to
575 			 * guarentee the topology.
576 			 */
577 			if (cs->sc_nccdisks % 2) {
578 				kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
579 				error = EINVAL;
580 				goto fail;
581 			}
582 			if (cs->sc_ileave == 0) {
583 				kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
584 				error = EINVAL;
585 				goto fail;
586 			}
587 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
588 		} else if (ccd->ccd_flags & CCDF_PARITY) {
589 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
590 		} else {
591 			if (cs->sc_ileave == 0) {
592 				kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
593 				error = EINVAL;
594 				goto fail;
595 			}
596 			cs->sc_size = cs->sc_nccdisks * minsize;
597 		}
598 	}
599 
600 	/*
601 	 * Construct the interleave table.
602 	 */
603 	ccdinterleave(cs, ccd->ccd_unit);
604 
605 	/*
606 	 * Create pseudo-geometry based on 1MB cylinders.  It's
607 	 * pretty close.
608 	 */
609 	ccg->ccg_secsize = maxsecsize;
610 	ccg->ccg_ntracks = 1;
611 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
612 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
613 
614 	/*
615 	 * Add an devstat entry for this device.
616 	 */
617 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
618 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
619 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
620 			  DEVSTAT_PRIORITY_ARRAY);
621 
622 	cs->sc_flags |= CCDF_INITED;
623 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
624 	cs->sc_unit = ccd->ccd_unit;
625 	return (0);
626 fail:
627 	while (ci > cs->sc_cinfo) {
628 		ci--;
629 		kfree(ci->ci_path, M_DEVBUF);
630 	}
631 	kfree(cs->sc_cinfo, M_DEVBUF);
632 	cs->sc_cinfo = NULL;
633 	return (error);
634 }
635 
636 static void
637 ccdinterleave(struct ccd_softc *cs, int unit)
638 {
639 	struct ccdcinfo *ci, *smallci;
640 	struct ccdiinfo *ii;
641 	u_int64_t bn;
642 	u_int64_t lbn;
643 	u_int64_t size;
644 	int icount;
645 	int ix;
646 
647 #ifdef DEBUG
648 	if (ccddebug & CCDB_INIT)
649 		kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
650 #endif
651 
652 	/*
653 	 * Allocate an interleave table.  The worst case occurs when each
654 	 * of N disks is of a different size, resulting in N interleave
655 	 * tables.
656 	 *
657 	 * Chances are this is too big, but we don't care.
658 	 */
659 	icount = cs->sc_nccdisks + 1;
660 	cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
661 				M_DEVBUF, M_WAITOK|M_ZERO);
662 
663 	/*
664 	 * Trivial case: no interleave (actually interleave of disk size).
665 	 * Each table entry represents a single component in its entirety.
666 	 *
667 	 * An interleave of 0 may not be used with a mirror or parity setup.
668 	 */
669 	if (cs->sc_ileave == 0) {
670 		bn = 0;
671 		ii = cs->sc_itable;
672 
673 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
674 			/* Allocate space for ii_index. */
675 			ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
676 			ii->ii_ndisk = 1;
677 			ii->ii_startblk = bn;
678 			ii->ii_startoff = 0;
679 			ii->ii_index[0] = ix;
680 			bn += cs->sc_cinfo[ix].ci_size;
681 			ii++;
682 		}
683 		ii->ii_ndisk = 0;
684 #ifdef DEBUG
685 		if (ccddebug & CCDB_INIT)
686 			printiinfo(cs->sc_itable);
687 #endif
688 		return;
689 	}
690 
691 	/*
692 	 * The following isn't fast or pretty; it doesn't have to be.
693 	 */
694 	size = 0;
695 	bn = lbn = 0;
696 	for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
697 		/*
698 		 * Allocate space for ii_index.  We might allocate more then
699 		 * we use.
700 		 */
701 		ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
702 					M_DEVBUF, M_WAITOK);
703 
704 		/*
705 		 * Locate the smallest of the remaining components
706 		 */
707 		smallci = NULL;
708 		ci = cs->sc_cinfo;
709 		while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
710 			if (ci->ci_size > size &&
711 			    (smallci == NULL ||
712 			     ci->ci_size < smallci->ci_size)) {
713 				smallci = ci;
714 			}
715 			++ci;
716 		}
717 
718 		/*
719 		 * Nobody left, all done
720 		 */
721 		if (smallci == NULL) {
722 			ii->ii_ndisk = 0;
723 			break;
724 		}
725 
726 		/*
727 		 * Record starting logical block using an sc_ileave blocksize.
728 		 */
729 		ii->ii_startblk = bn / cs->sc_ileave;
730 
731 		/*
732 		 * Record starting component block using an sc_ileave
733 		 * blocksize.  This value is relative to the beginning of
734 		 * a component disk.
735 		 */
736 		ii->ii_startoff = lbn;
737 
738 		/*
739 		 * Determine how many disks take part in this interleave
740 		 * and record their indices.
741 		 */
742 		ix = 0;
743 		for (ci = cs->sc_cinfo;
744 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
745 			if (ci->ci_size >= smallci->ci_size) {
746 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
747 			}
748 		}
749 		ii->ii_ndisk = ix;
750 
751 		/*
752 		 * Adjust for loop
753 		 */
754 		bn += ix * (smallci->ci_size - size);
755 		lbn = smallci->ci_size / cs->sc_ileave;
756 		size = smallci->ci_size;
757 	}
758 	if (ii == &cs->sc_itable[icount])
759 		panic("ccdinterlave software bug!  table exhausted");
760 #ifdef DEBUG
761 	if (ccddebug & CCDB_INIT)
762 		printiinfo(cs->sc_itable);
763 #endif
764 }
765 
766 /* ARGSUSED */
767 static int
768 ccdopen(struct dev_open_args *ap)
769 {
770 	cdev_t dev = ap->a_head.a_dev;
771 	int unit = ccdunit(dev);
772 	struct ccd_softc *cs;
773 	int error = 0;
774 
775 #ifdef DEBUG
776 	if (ccddebug & CCDB_FOLLOW)
777 		kprintf("ccdopen(%x, %x)\n", dev, flags);
778 #endif
779 	if (unit >= numccd)
780 		return (ENXIO);
781 	cs = &ccd_softc[unit];
782 
783 	if ((error = ccdlock(cs)) == 0) {
784 		ccdunlock(cs);
785 	}
786 	return (error);
787 }
788 
789 /* ARGSUSED */
790 static int
791 ccdclose(struct dev_close_args *ap)
792 {
793 	cdev_t dev = ap->a_head.a_dev;
794 	int unit = ccdunit(dev);
795 	struct ccd_softc *cs;
796 	int error = 0;
797 
798 #ifdef DEBUG
799 	if (ccddebug & CCDB_FOLLOW)
800 		kprintf("ccdclose(%x, %x)\n", dev, flags);
801 #endif
802 
803 	if (unit >= numccd)
804 		return (ENXIO);
805 	cs = &ccd_softc[unit];
806 	if ((error = ccdlock(cs)) == 0) {
807 		ccdunlock(cs);
808 	}
809 	return (error);
810 }
811 
812 static int
813 ccdstrategy(struct dev_strategy_args *ap)
814 {
815 	cdev_t dev = ap->a_head.a_dev;
816 	struct bio *bio = ap->a_bio;
817 	int unit = ccdunit(dev);
818 	struct bio *nbio;
819 	struct buf *bp = bio->bio_buf;
820 	struct ccd_softc *cs = &ccd_softc[unit];
821 	u_int64_t pbn;	/* in sc_secsize chunks */
822 	u_int32_t sz;	/* in sc_secsize chunks */
823 
824 #ifdef DEBUG
825 	if (ccddebug & CCDB_FOLLOW)
826 		kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
827 #endif
828 	if ((cs->sc_flags & CCDF_INITED) == 0) {
829 		bp->b_error = ENXIO;
830 		goto error;
831 	}
832 
833 	/* If it's a nil transfer, wake up the top half now. */
834 	if (bp->b_bcount == 0) {
835 		bp->b_resid = 0;
836 		goto done;
837 	}
838 
839 	/*
840 	 * Do bounds checking and adjust transfer.  If there's an
841 	 * error, the bounds check will flag that for us.
842 	 */
843 
844 	pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
845 	sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
846 
847 	/*
848 	 * If out of bounds return an error.  If the request goes
849 	 * past EOF, clip the request as appropriate.  If exactly
850 	 * at EOF, return success (don't clip), but with 0 bytes
851 	 * of I/O.
852 	 *
853 	 * Mark EOF B_INVAL (just like bad), indicating that the
854 	 * contents of the buffer, if any, is invalid.
855 	 */
856 	if ((int64_t)pbn < 0)
857 		goto bad;
858 	if (pbn + sz > cs->sc_size) {
859 		if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
860 			goto bad;
861 		if (pbn == cs->sc_size) {
862 			bp->b_resid = bp->b_bcount;
863 			bp->b_flags |= B_INVAL;
864 			goto done;
865 		}
866 		sz = (long)(cs->sc_size - pbn);
867 		bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
868 	}
869 	nbio = bio;
870 
871 	bp->b_resid = bp->b_bcount;
872 	nbio->bio_driver_info = dev;
873 
874 	/*
875 	 * "Start" the unit.
876 	 */
877 	crit_enter();
878 	ccdstart(cs, nbio);
879 	crit_exit();
880 	return(0);
881 
882 	/*
883 	 * note: bio, not nbio, is valid at the done label.
884 	 */
885 bad:
886 	bp->b_error = EINVAL;
887 error:
888 	bp->b_resid = bp->b_bcount;
889 	bp->b_flags |= B_ERROR | B_INVAL;
890 done:
891 	biodone(bio);
892 	return(0);
893 }
894 
895 static void
896 ccdstart(struct ccd_softc *cs, struct bio *bio)
897 {
898 	long bcount, rcount;
899 	struct ccdbuf *cbp[4];
900 	struct buf *bp = bio->bio_buf;
901 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
902 	caddr_t addr;
903 	off_t doffset;
904 
905 #ifdef DEBUG
906 	if (ccddebug & CCDB_FOLLOW)
907 		kprintf("ccdstart(%x, %x)\n", cs, bp);
908 #endif
909 
910 	/* Record the transaction start  */
911 	devstat_start_transaction(&cs->device_stats);
912 
913 	/*
914 	 * Allocate component buffers and fire off the requests
915 	 */
916 	doffset = bio->bio_offset;
917 	addr = bp->b_data;
918 
919 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
920 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
921 		rcount = cbp[0]->cb_buf.b_bcount;
922 
923 		if (cs->sc_cflags & CCDF_MIRROR) {
924 			/*
925 			 * Mirroring.  Writes go to both disks, reads are
926 			 * taken from whichever disk seems most appropriate.
927 			 *
928 			 * We attempt to localize reads to the disk whos arm
929 			 * is nearest the read request.  We ignore seeks due
930 			 * to writes when making this determination and we
931 			 * also try to avoid hogging.
932 			 */
933 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
934 				vn_strategy(cbp[0]->cb_vp,
935 					    &cbp[0]->cb_buf.b_bio1);
936 				vn_strategy(cbp[1]->cb_vp,
937 					    &cbp[1]->cb_buf.b_bio1);
938 			} else {
939 				int pick = cs->sc_pick;
940 				daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
941 				if (doffset < cs->sc_blk[pick] - range ||
942 				    doffset > cs->sc_blk[pick] + range
943 				) {
944 					cs->sc_pick = pick = 1 - pick;
945 				}
946 				cs->sc_blk[pick] = doffset + rcount;
947 				vn_strategy(cbp[pick]->cb_vp,
948 					    &cbp[pick]->cb_buf.b_bio1);
949 			}
950 		} else {
951 			/*
952 			 * Not mirroring
953 			 */
954 			vn_strategy(cbp[0]->cb_vp,
955 				     &cbp[0]->cb_buf.b_bio1);
956 		}
957 		doffset += rcount;
958 		addr += rcount;
959 	}
960 }
961 
962 /*
963  * Build a component buffer header.
964  */
965 static void
966 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
967 	  off_t doffset, caddr_t addr, long bcount)
968 {
969 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
970 	struct ccdbuf *cbp;
971 	u_int64_t bn;
972 	u_int64_t cbn;
973 	u_int64_t cboff;
974 	off_t cbc;
975 
976 #ifdef DEBUG
977 	if (ccddebug & CCDB_IO)
978 		kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
979 		       cs, bp, bn, addr, bcount);
980 #endif
981 	/*
982 	 * Determine which component bn falls in.
983 	 */
984 	bn = doffset / cs->sc_geom.ccg_secsize;
985 	cbn = bn;
986 	cboff = 0;
987 
988 	if (cs->sc_ileave == 0) {
989 		/*
990 		 * Serially concatenated and neither a mirror nor a parity
991 		 * config.  This is a special case.
992 		 */
993 		daddr_t sblk;
994 
995 		sblk = 0;
996 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
997 			sblk += ci->ci_size;
998 		cbn -= sblk;
999 	} else {
1000 		struct ccdiinfo *ii;
1001 		int ccdisk, off;
1002 
1003 		/*
1004 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
1005 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
1006 		 * to cbn.
1007 		 */
1008 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
1009 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
1010 
1011 		/*
1012 		 * Figure out which interleave table to use.
1013 		 */
1014 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
1015 			if (ii->ii_startblk > cbn)
1016 				break;
1017 		}
1018 		ii--;
1019 
1020 		/*
1021 		 * off is the logical superblock relative to the beginning
1022 		 * of this interleave block.
1023 		 */
1024 		off = cbn - ii->ii_startblk;
1025 
1026 		/*
1027 		 * We must calculate which disk component to use (ccdisk),
1028 		 * and recalculate cbn to be the superblock relative to
1029 		 * the beginning of the component.  This is typically done by
1030 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1031 		 * must typically be divided by the number of components in
1032 		 * this interleave array to be properly convert it from a
1033 		 * CCD-relative logical superblock number to a
1034 		 * component-relative superblock number.
1035 		 */
1036 		if (ii->ii_ndisk == 1) {
1037 			/*
1038 			 * When we have just one disk, it can't be a mirror
1039 			 * or a parity config.
1040 			 */
1041 			ccdisk = ii->ii_index[0];
1042 			cbn = ii->ii_startoff + off;
1043 		} else {
1044 			if (cs->sc_cflags & CCDF_MIRROR) {
1045 				/*
1046 				 * We have forced a uniform mapping, resulting
1047 				 * in a single interleave array.  We double
1048 				 * up on the first half of the available
1049 				 * components and our mirror is in the second
1050 				 * half.  This only works with a single
1051 				 * interleave array because doubling up
1052 				 * doubles the number of sectors, so there
1053 				 * cannot be another interleave array because
1054 				 * the next interleave array's calculations
1055 				 * would be off.
1056 				 */
1057 				int ndisk2 = ii->ii_ndisk / 2;
1058 				ccdisk = ii->ii_index[off % ndisk2];
1059 				cbn = ii->ii_startoff + off / ndisk2;
1060 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1061 			} else if (cs->sc_cflags & CCDF_PARITY) {
1062 				/*
1063 				 * XXX not implemented yet
1064 				 */
1065 				int ndisk2 = ii->ii_ndisk - 1;
1066 				ccdisk = ii->ii_index[off % ndisk2];
1067 				cbn = ii->ii_startoff + off / ndisk2;
1068 				if (cbn % ii->ii_ndisk <= ccdisk)
1069 					ccdisk++;
1070 			} else {
1071 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1072 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1073 			}
1074 		}
1075 
1076 		ci = &cs->sc_cinfo[ccdisk];
1077 
1078 		/*
1079 		 * Convert cbn from a superblock to a normal block so it
1080 		 * can be used to calculate (along with cboff) the normal
1081 		 * block index into this particular disk.
1082 		 */
1083 		cbn *= cs->sc_ileave;
1084 	}
1085 
1086 	/*
1087 	 * Fill in the component buf structure.
1088 	 *
1089 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1090 	 * will be truncated on device EOF so we use b_bufsize to detect
1091 	 * the case.
1092 	 */
1093 	cbp = getccdbuf();
1094 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1095 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1096 	cbp->cb_buf.b_data = addr;
1097 	cbp->cb_vp = ci->ci_vp;
1098 	if (cs->sc_ileave == 0)
1099 		cbc = dbtob((off_t)(ci->ci_size - cbn));
1100 	else
1101 		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1102 	if (cbc > cs->sc_maxiosize)
1103 		cbc = cs->sc_maxiosize;
1104 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1105  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1106 
1107 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1108 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1109 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1110 
1111 	/*
1112 	 * context for ccdiodone
1113 	 */
1114 	cbp->cb_obio = bio;
1115 	cbp->cb_unit = cs - ccd_softc;
1116 	cbp->cb_comp = ci - cs->sc_cinfo;
1117 
1118 #ifdef DEBUG
1119 	if (ccddebug & CCDB_IO)
1120 		kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1121 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1122 		       cbp->cb_buf.b_bio1.bio_offset,
1123 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1124 #endif
1125 	cb[0] = cbp;
1126 
1127 	/*
1128 	 * Note: both I/O's setup when reading from mirror, but only one
1129 	 * will be executed.
1130 	 */
1131 	if (cs->sc_cflags & CCDF_MIRROR) {
1132 		/* mirror, setup second I/O */
1133 		cbp = getccdbuf();
1134 
1135 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1136 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1137 		cbp->cb_buf.b_data = addr;
1138 		cbp->cb_vp = ci2->ci_vp;
1139 		if (cs->sc_ileave == 0)
1140 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1141 		else
1142 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1143 		if (cbc > cs->sc_maxiosize)
1144 			cbc = cs->sc_maxiosize;
1145 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1146 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1147 
1148 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1149 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1150 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1151 
1152 		/*
1153 		 * context for ccdiodone
1154 		 */
1155 		cbp->cb_obio = bio;
1156 		cbp->cb_unit = cs - ccd_softc;
1157 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1158 		cb[1] = cbp;
1159 		/* link together the ccdbuf's and clear "mirror done" flag */
1160 		cb[0]->cb_mirror = cb[1];
1161 		cb[1]->cb_mirror = cb[0];
1162 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1163 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1164 	}
1165 }
1166 
1167 static void
1168 ccdintr(struct ccd_softc *cs, struct bio *bio)
1169 {
1170 	struct buf *bp = bio->bio_buf;
1171 
1172 #ifdef DEBUG
1173 	if (ccddebug & CCDB_FOLLOW)
1174 		kprintf("ccdintr(%x, %x)\n", cs, bp);
1175 #endif
1176 	/*
1177 	 * Request is done for better or worse, wakeup the top half.
1178 	 */
1179 	if (bp->b_flags & B_ERROR)
1180 		bp->b_resid = bp->b_bcount;
1181 	devstat_end_transaction_buf(&cs->device_stats, bp);
1182 	biodone(bio);
1183 }
1184 
1185 /*
1186  * Called at interrupt time.
1187  *
1188  * Mark the component as done and if all components are done,
1189  * take a ccd interrupt.
1190  */
1191 static void
1192 ccdiodone(struct bio *bio)
1193 {
1194 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1195 	struct bio *obio = cbp->cb_obio;
1196 	struct buf *obp = obio->bio_buf;
1197 	int unit = cbp->cb_unit;
1198 	int count;
1199 
1200 	/*
1201 	 * Since we do not have exclusive access to underlying devices,
1202 	 * we can't keep cache translations around.
1203 	 */
1204 	clearbiocache(bio->bio_next);
1205 
1206 	get_mplock();
1207 	crit_enter();
1208 #ifdef DEBUG
1209 	if (ccddebug & CCDB_FOLLOW)
1210 		kprintf("ccdiodone(%x)\n", cbp);
1211 	if (ccddebug & CCDB_IO) {
1212 		kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1213 		       obp, obp->b_bcount, obp->b_resid);
1214 		kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1215 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1216 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1217 		       cbp->cb_buf.b_bcount);
1218 	}
1219 #endif
1220 
1221 	/*
1222 	 * If an error occured, report it.  If this is a mirrored
1223 	 * configuration and the first of two possible reads, do not
1224 	 * set the error in the bp yet because the second read may
1225 	 * succeed.
1226 	 */
1227 	if (cbp->cb_buf.b_flags & B_ERROR) {
1228 		const char *msg = "";
1229 
1230 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1231 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1232 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1233 			/*
1234 			 * We will try our read on the other disk down
1235 			 * below, also reverse the default pick so if we
1236 			 * are doing a scan we do not keep hitting the
1237 			 * bad disk first.
1238 			 */
1239 			struct ccd_softc *cs = &ccd_softc[unit];
1240 
1241 			msg = ", trying other disk";
1242 			cs->sc_pick = 1 - cs->sc_pick;
1243 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1244 		} else {
1245 			obp->b_flags |= B_ERROR;
1246 			obp->b_error = cbp->cb_buf.b_error ?
1247 			    cbp->cb_buf.b_error : EIO;
1248 		}
1249 		kprintf("ccd%d: error %d on component %d "
1250 			"offset %jd (ccd offset %jd)%s\n",
1251 		        unit, obp->b_error, cbp->cb_comp,
1252 		        (intmax_t)cbp->cb_buf.b_bio2.bio_offset,
1253 		        (intmax_t)obio->bio_offset,
1254 		        msg);
1255 	}
1256 
1257 	/*
1258 	 * Process mirror.  If we are writing, I/O has been initiated on both
1259 	 * buffers and we fall through only after both are finished.
1260 	 *
1261 	 * If we are reading only one I/O is initiated at a time.  If an
1262 	 * error occurs we initiate the second I/O and return, otherwise
1263 	 * we free the second I/O without initiating it.
1264 	 */
1265 
1266 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1267 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1268 			/*
1269 			 * When writing, handshake with the second buffer
1270 			 * to determine when both are done.  If both are not
1271 			 * done, return here.
1272 			 */
1273 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1274 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1275 				putccdbuf(cbp);
1276 				crit_exit();
1277 				rel_mplock();
1278 				return;
1279 			}
1280 		} else {
1281 			/*
1282 			 * When reading, either dispose of the second buffer
1283 			 * or initiate I/O on the second buffer if an error
1284 			 * occured with this one.
1285 			 */
1286 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1287 				if (cbp->cb_buf.b_flags & B_ERROR) {
1288 					cbp->cb_mirror->cb_pflags |=
1289 					    CCDPF_MIRROR_DONE;
1290 					vn_strategy(
1291 					    cbp->cb_mirror->cb_vp,
1292 					    &cbp->cb_mirror->cb_buf.b_bio1
1293 					);
1294 					putccdbuf(cbp);
1295 					crit_exit();
1296 					rel_mplock();
1297 					return;
1298 				} else {
1299 					putccdbuf(cbp->cb_mirror);
1300 					/* fall through */
1301 				}
1302 			}
1303 		}
1304 	}
1305 
1306 	/*
1307 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1308 	 */
1309 	count = cbp->cb_buf.b_bufsize;
1310 	putccdbuf(cbp);
1311 
1312 	/*
1313 	 * If all done, "interrupt".
1314 	 */
1315 	obp->b_resid -= count;
1316 	if (obp->b_resid < 0)
1317 		panic("ccdiodone: count");
1318 	if (obp->b_resid == 0)
1319 		ccdintr(&ccd_softc[unit], obio);
1320 	crit_exit();
1321 	rel_mplock();
1322 }
1323 
1324 static int
1325 ccdioctl(struct dev_ioctl_args *ap)
1326 {
1327 	cdev_t dev = ap->a_head.a_dev;
1328 	int unit = ccdunit(dev);
1329 	int i, j, lookedup = 0, error = 0;
1330 	struct ccd_softc *cs;
1331 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1332 	struct ccddevice ccd;
1333 	struct disk_info info;
1334 	char **cpp;
1335 	struct vnode **vpp;
1336 
1337 	if (unit >= numccd)
1338 		return (ENXIO);
1339 	cs = &ccd_softc[unit];
1340 
1341 	bzero(&ccd, sizeof(ccd));
1342 
1343 	switch (ap->a_cmd) {
1344 	case CCDIOCSET:
1345 		if (cs->sc_flags & CCDF_INITED)
1346 			return (EBUSY);
1347 
1348 		if ((ap->a_fflag & FWRITE) == 0)
1349 			return (EBADF);
1350 
1351 		if ((error = ccdlock(cs)) != 0)
1352 			return (error);
1353 
1354 		if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1355 			ccdunlock(cs);
1356 			return (EINVAL);
1357 		}
1358 
1359 		/* Fill in some important bits. */
1360 		ccd.ccd_unit = unit;
1361 		ccd.ccd_interleave = ccio->ccio_ileave;
1362 		if (ccd.ccd_interleave == 0 &&
1363 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1364 		     (ccio->ccio_flags & CCDF_PARITY))) {
1365 			kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1366 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1367 		}
1368 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1369 		    (ccio->ccio_flags & CCDF_PARITY)) {
1370 			kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1371 			ccio->ccio_flags &= ~CCDF_PARITY;
1372 		}
1373 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1374 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1375 			kprintf("ccd%d: mirror/parity forces uniform flag\n",
1376 			       unit);
1377 			ccio->ccio_flags |= CCDF_UNIFORM;
1378 		}
1379 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1380 
1381 		/*
1382 		 * Allocate space for and copy in the array of
1383 		 * componet pathnames and device numbers.
1384 		 */
1385 		cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1386 		    M_DEVBUF, M_WAITOK);
1387 		vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1388 		    M_DEVBUF, M_WAITOK);
1389 
1390 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1391 				ccio->ccio_ndisks * sizeof(char **));
1392 		if (error) {
1393 			kfree(vpp, M_DEVBUF);
1394 			kfree(cpp, M_DEVBUF);
1395 			ccdunlock(cs);
1396 			return (error);
1397 		}
1398 
1399 #ifdef DEBUG
1400 		if (ccddebug & CCDB_INIT) {
1401 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1402 				kprintf("ccdioctl: component %d: 0x%x\n",
1403 				    i, cpp[i]);
1404 		}
1405 #endif
1406 
1407 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1408 #ifdef DEBUG
1409 			if (ccddebug & CCDB_INIT)
1410 				kprintf("ccdioctl: lookedup = %d\n", lookedup);
1411 #endif
1412 			if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1413 				for (j = 0; j < lookedup; ++j)
1414 					(void)vn_close(vpp[j], FREAD|FWRITE);
1415 				kfree(vpp, M_DEVBUF);
1416 				kfree(cpp, M_DEVBUF);
1417 				ccdunlock(cs);
1418 				return (error);
1419 			}
1420 			++lookedup;
1421 		}
1422 		ccd.ccd_cpp = cpp;
1423 		ccd.ccd_vpp = vpp;
1424 		ccd.ccd_ndev = ccio->ccio_ndisks;
1425 
1426 		/*
1427 		 * Initialize the ccd.  Fills in the softc for us.
1428 		 */
1429 		if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1430 			for (j = 0; j < lookedup; ++j)
1431 				(void)vn_close(vpp[j], FREAD|FWRITE);
1432 			kfree(vpp, M_DEVBUF);
1433 			kfree(cpp, M_DEVBUF);
1434 			ccdunlock(cs);
1435 			return (error);
1436 		}
1437 
1438 		/*
1439 		 * The ccd has been successfully initialized, so
1440 		 * we can place it into the array and read the disklabel.
1441 		 */
1442 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1443 		ccio->ccio_unit = unit;
1444 		ccio->ccio_size = cs->sc_size;
1445 
1446 		bzero(&info, sizeof(info));
1447 		info.d_media_blksize = cs->sc_geom.ccg_secsize;
1448 		info.d_media_blocks  = cs->sc_size;
1449 		info.d_nheads	     = cs->sc_geom.ccg_ntracks;
1450 		info.d_secpertrack   = cs->sc_geom.ccg_nsectors;
1451 		info.d_ncylinders    = cs->sc_geom.ccg_ncylinders;
1452 		info.d_secpercyl     = info.d_nheads * info.d_secpertrack;
1453 
1454 		/*
1455 		 * For cases where a label is directly applied to the ccd,
1456 		 * without slices, DSO_COMPATMBR forces one sector be
1457 		 * reserved for backwards compatibility.
1458 		 */
1459 		info.d_dsflags	     = DSO_COMPATMBR;
1460 		disk_setdiskinfo(&cs->sc_disk, &info);
1461 
1462 		ccdunlock(cs);
1463 
1464 		break;
1465 
1466 	case CCDIOCCLR:
1467 		if ((cs->sc_flags & CCDF_INITED) == 0)
1468 			return (ENXIO);
1469 
1470 		if ((ap->a_fflag & FWRITE) == 0)
1471 			return (EBADF);
1472 
1473 		if ((error = ccdlock(cs)) != 0)
1474 			return (error);
1475 
1476 		if (dev_drefs(cs->sc_dev) > 1) {
1477 			ccdunlock(cs);
1478 			return (EBUSY);
1479 		}
1480 
1481 		/*
1482 		 * Free ccd_softc information and clear entry.
1483 		 */
1484 
1485 		/* Close the components and free their pathnames. */
1486 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1487 			/*
1488 			 * XXX: this close could potentially fail and
1489 			 * cause Bad Things.  Maybe we need to force
1490 			 * the close to happen?
1491 			 */
1492 #ifdef DEBUG
1493 			if (ccddebug & CCDB_VNODE)
1494 				vprint("CCDIOCCLR: vnode info",
1495 				    cs->sc_cinfo[i].ci_vp);
1496 #endif
1497 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1498 			kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1499 		}
1500 
1501 		/* Free interleave index. */
1502 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1503 			kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1504 
1505 		/* Free component info and interleave table. */
1506 		kfree(cs->sc_cinfo, M_DEVBUF);
1507 		kfree(cs->sc_itable, M_DEVBUF);
1508 		cs->sc_cinfo = NULL;
1509 		cs->sc_itable = NULL;
1510 		cs->sc_flags &= ~CCDF_INITED;
1511 
1512 		/*
1513 		 * Free ccddevice information and clear entry.
1514 		 */
1515 		kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1516 		kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1517 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1518 
1519 		/*
1520 		 * And remove the devstat entry.
1521 		 */
1522 		devstat_remove_entry(&cs->device_stats);
1523 
1524 		/* This must be atomic. */
1525 		crit_enter();
1526 		ccdunlock(cs);
1527 		crit_exit();
1528 
1529 		break;
1530 
1531 	default:
1532 		return (ENOTTY);
1533 	}
1534 
1535 	return (0);
1536 }
1537 
/*
 * Dump handler.  Not implemented for the ccd: the arguments are ignored
 * and every call fails with ENXIO.
 */
static int
ccddump(struct dev_dump_args *ap)
{
	int error = ENXIO;	/* not implemented */

	return (error);
}
1544 
1545 /*
1546  * Lookup the provided name in the filesystem.  If the file exists,
1547  * is a valid block device, and isn't being used by anyone else,
1548  * set *vpp to the file's vnode.
1549  */
1550 static int
1551 ccdlookup(char *path, struct vnode **vpp)
1552 {
1553 	struct nlookupdata nd;
1554 	struct vnode *vp;
1555 	int error;
1556 
1557 	*vpp = NULL;
1558 
1559 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1560 	if (error)
1561 		return (error);
1562 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1563 #ifdef DEBUG
1564 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1565 			kprintf("ccdlookup: vn_open error = %d\n", error);
1566 #endif
1567 		goto done;
1568 	}
1569 	vp = nd.nl_open_vp;
1570 
1571 	if (vp->v_opencount > 1) {
1572 		error = EBUSY;
1573 		goto done;
1574 	}
1575 
1576 	if (!vn_isdisk(vp, &error))
1577 		goto done;
1578 
1579 #ifdef DEBUG
1580 	if (ccddebug & CCDB_VNODE)
1581 		vprint("ccdlookup: vnode info", vp);
1582 #endif
1583 
1584 	vn_unlock(vp);
1585 	nd.nl_open_vp = NULL;
1586 	nlookup_done(&nd);
1587 	*vpp = vp;				/* leave ref intact  */
1588 	return (0);
1589 done:
1590 	nlookup_done(&nd);
1591 	return (error);
1592 }
1593 
1594 /*
1595  * Wait interruptibly for an exclusive lock.
1596  *
1597  * XXX
1598  * Several drivers do this; it should be abstracted and made MP-safe.
1599  */
1600 static int
1601 ccdlock(struct ccd_softc *cs)
1602 {
1603 	int error;
1604 
1605 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1606 		cs->sc_flags |= CCDF_WANTED;
1607 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1608 			return (error);
1609 	}
1610 	cs->sc_flags |= CCDF_LOCKED;
1611 	return (0);
1612 }
1613 
1614 /*
1615  * Unlock and wake up any waiters.
1616  */
1617 static void
1618 ccdunlock(struct ccd_softc *cs)
1619 {
1620 
1621 	cs->sc_flags &= ~CCDF_LOCKED;
1622 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1623 		cs->sc_flags &= ~CCDF_WANTED;
1624 		wakeup(cs);
1625 	}
1626 }
1627 
#ifdef DEBUG
/*
 * Debug helper: print an interleave table, one line per entry.  The
 * walk stops at the first entry whose ii_ndisk is zero.  Each line shows
 * the entry number, disk count, start block, start offset and then the
 * ii_index[] values for that entry.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;
	int d;

	while (ii->ii_ndisk) {
		kprintf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);

		d = 0;
		while (d < ii->ii_ndisk) {
			kprintf(" %d", ii->ii_index[d]);
			d++;
		}
		kprintf("\n");

		entry++;
		ii++;
	}
}
#endif
1643 
1644 
1645 /* Local Variables: */
1646 /* c-argdecl-indent: 8 */
1647 /* c-continued-statement-offset: 8 */
1648 /* c-indent-level: 8 */
1649 /* End: */
1650