xref: /dflybsd-src/sys/dev/disk/ccd/ccd.c (revision 41871674d0079dec70d55eb824f39d07dc7b3310)
1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.25 2006/04/03 02:02:32 dillon Exp $ */
3 
4 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
5 
6 /*
7  * Copyright (c) 1995 Jason R. Thorpe.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project
21  *	by Jason R. Thorpe.
22  * 4. The name of the author may not be used to endorse or promote products
23  *    derived from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1988 University of Utah.
40  * Copyright (c) 1990, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * the Systems Programming Group of the University of Utah Computer
45  * Science Department.
46  *
47  * Redistribution and use in source and binary forms, with or without
48  * modification, are permitted provided that the following conditions
49  * are met:
50  * 1. Redistributions of source code must retain the above copyright
51  *    notice, this list of conditions and the following disclaimer.
52  * 2. Redistributions in binary form must reproduce the above copyright
53  *    notice, this list of conditions and the following disclaimer in the
54  *    documentation and/or other materials provided with the distribution.
55  * 3. All advertising materials mentioning features or use of this software
56  *    must display the following acknowledgement:
57  *	This product includes software developed by the University of
58  *	California, Berkeley and its contributors.
59  * 4. Neither the name of the University nor the names of its contributors
60  *    may be used to endorse or promote products derived from this software
61  *    without specific prior written permission.
62  *
63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73  * SUCH DAMAGE.
74  *
75  * from: Utah $Hdr: cd.c 1.6 90/11/28$
76  *
77  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
78  */
79 
80 /*
81  * "Concatenated" disk driver.
82  *
83  * Dynamic configuration and disklabel support by:
84  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
85  *	Numerical Aerodynamic Simulation Facility
86  *	Mail Stop 258-6
87  *	NASA Ames Research Center
88  *	Moffett Field, CA 94035
89  */
90 
91 #include "use_ccd.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
97 #include <sys/proc.h>
98 #include <sys/buf.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <sys/devicestat.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
108 #include <sys/buf2.h>
109 #include <sys/ccdvar.h>
110 
111 #include <vm/vm_zone.h>
112 
113 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
114 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
115 
116 #include <sys/thread2.h>
117 
118 #if defined(CCDDEBUG) && !defined(DEBUG)
119 #define DEBUG
120 #endif
121 
122 #ifdef DEBUG
123 #define CCDB_FOLLOW	0x01
124 #define CCDB_INIT	0x02
125 #define CCDB_IO		0x04
126 #define CCDB_LABEL	0x08
127 #define CCDB_VNODE	0x10
128 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
129     CCDB_VNODE;
130 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
131 #undef DEBUG
132 #endif
133 
134 #define	ccdunit(x)	dkunit(x)
135 #define ccdpart(x)	dkpart(x)
136 
137 /*
138    This is how mirroring works (only writes are special):
139 
140    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
141    linked together by the cb_mirror field.  "cb_pflags &
142    CCDPF_MIRROR_DONE" is set to 0 on both of them.
143 
144    When a component returns to ccdiodone(), it checks if "cb_pflags &
145    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
146    flag and returns.  If it is, it means its partner has already
147    returned, so it will go to the regular cleanup.
148 
149  */
150 
151 struct ccdbuf {
152 	struct buf	cb_buf;		/* new I/O buf */
153 	struct bio	*cb_obio;	/* ptr. to original I/O buf */
154 	struct ccdbuf	*cb_freenext;	/* free list link */
155 	int		cb_unit;	/* target unit */
156 	int		cb_comp;	/* target component */
157 	int		cb_pflags;	/* mirror/parity status flag */
158 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
159 };
160 
161 /* bits in cb_pflags */
162 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
163 
164 #define CCDLABELDEV(dev)	\
165 	(make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
166 
167 static d_open_t ccdopen;
168 static d_close_t ccdclose;
169 static d_strategy_t ccdstrategy;
170 static d_ioctl_t ccdioctl;
171 static d_dump_t ccddump;
172 static d_psize_t ccdsize;
173 
174 #define NCCDFREEHIWAT	16
175 
176 #define CDEV_MAJOR 74
177 
178 static struct cdevsw ccd_cdevsw = {
179 	/* name */	"ccd",
180 	/* maj */	CDEV_MAJOR,
181 	/* flags */	D_DISK,
182 	/* port */      NULL,
183 	/* clone */	NULL,
184 
185 	/* open */	ccdopen,
186 	/* close */	ccdclose,
187 	/* read */	physread,
188 	/* write */	physwrite,
189 	/* ioctl */	ccdioctl,
190 	/* poll */	nopoll,
191 	/* mmap */	nommap,
192 	/* strategy */	ccdstrategy,
193 	/* dump */	ccddump,
194 	/* psize */	ccdsize
195 };
196 
197 /* called during module initialization */
198 static	void ccdattach (void);
199 static	int ccd_modevent (module_t, int, void *);
200 
201 /* called by biodone() at interrupt time */
202 static	void ccdiodone (struct bio *bio);
203 
204 static	void ccdstart (struct ccd_softc *, struct bio *);
205 static	void ccdinterleave (struct ccd_softc *, int);
206 static	void ccdintr (struct ccd_softc *, struct bio *);
207 static	int ccdinit (struct ccddevice *, char **, struct thread *);
208 static	int ccdlookup (char *, struct thread *td, struct vnode **);
209 static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
210 		struct bio *, off_t, caddr_t, long);
211 static	void ccdgetdisklabel (dev_t);
212 static	void ccdmakedisklabel (struct ccd_softc *);
213 static	int ccdlock (struct ccd_softc *);
214 static	void ccdunlock (struct ccd_softc *);
215 
216 #ifdef DEBUG
217 static	void printiinfo (struct ccdiinfo *);
218 #endif
219 
220 /* Non-private for the benefit of libkvm. */
221 struct	ccd_softc *ccd_softc;
222 struct	ccddevice *ccddevs;
223 struct	ccdbuf *ccdfreebufs;
224 static	int numccdfreebufs;
225 static	int numccd = 0;
226 
227 /*
228  * getccdbuf() -	Allocate and zero a ccd buffer.
229  *
230  *	This routine is called at splbio().
231  */
232 
233 static __inline
234 struct ccdbuf *
235 getccdbuf(void)
236 {
237 	struct ccdbuf *cbp;
238 
239 	/*
240 	 * Allocate from freelist or malloc as necessary
241 	 */
242 	if ((cbp = ccdfreebufs) != NULL) {
243 		ccdfreebufs = cbp->cb_freenext;
244 		--numccdfreebufs;
245 		reinitbufbio(&cbp->cb_buf);
246 	} else {
247 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
248 		initbufbio(&cbp->cb_buf);
249 	}
250 
251 	/*
252 	 * independant struct buf initialization
253 	 */
254 	LIST_INIT(&cbp->cb_buf.b_dep);
255 	BUF_LOCKINIT(&cbp->cb_buf);
256 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
257 	BUF_KERNPROC(&cbp->cb_buf);
258 
259 	return(cbp);
260 }
261 
262 /*
263  * putccdbuf() -	Free a ccd buffer.
264  *
265  *	This routine is called at splbio().
266  */
267 
268 static __inline
269 void
270 putccdbuf(struct ccdbuf *cbp)
271 {
272 	BUF_UNLOCK(&cbp->cb_buf);
273 	BUF_LOCKFREE(&cbp->cb_buf);
274 
275 	if (numccdfreebufs < NCCDFREEHIWAT) {
276 		cbp->cb_freenext = ccdfreebufs;
277 		ccdfreebufs = cbp;
278 		++numccdfreebufs;
279 	} else {
280 		free((caddr_t)cbp, M_DEVBUF);
281 	}
282 }
283 
284 
285 /*
 * Number of blocks to leave untouched at the front of a component partition.
287  * This is to avoid violating its disklabel area when it starts at the
288  * beginning of the slice.
289  */
290 #if !defined(CCD_OFFSET)
291 #define CCD_OFFSET 16
292 #endif
293 
294 /*
295  * Called by main() during pseudo-device attachment.  All we need
296  * to do is allocate enough space for devices to be configured later, and
297  * add devsw entries.
298  */
299 static void
300 ccdattach(void)
301 {
302 	int i;
303 	int num = NCCD;
304 
305 	if (num > 1)
306 		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
307 	else
308 		printf("ccd0: Concatenated disk driver\n");
309 
310 	ccd_softc = malloc(num * sizeof(struct ccd_softc), M_DEVBUF,
311 			    M_WAITOK | M_ZERO);
312 	ccddevs = malloc(num * sizeof(struct ccddevice), M_DEVBUF,
313 			    M_WAITOK | M_ZERO);
314 	numccd = num;
315 
316 	cdevsw_add(&ccd_cdevsw, 0, 0);
317 	/* XXX: is this necessary? */
318 	for (i = 0; i < numccd; ++i)
319 		ccddevs[i].ccd_dk = -1;
320 }
321 
322 static int
323 ccd_modevent(module_t mod, int type, void *data)
324 {
325 	int error = 0;
326 
327 	switch (type) {
328 	case MOD_LOAD:
329 		ccdattach();
330 		break;
331 
332 	case MOD_UNLOAD:
333 		printf("ccd0: Unload not supported!\n");
334 		error = EOPNOTSUPP;
335 		break;
336 
337 	default:	/* MOD_SHUTDOWN etc */
338 		break;
339 	}
340 	return (error);
341 }
342 
343 DEV_MODULE(ccd, ccd_modevent, NULL);
344 
345 static int
346 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
347 {
348 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
349 	struct ccdcinfo *ci = NULL;	/* XXX */
350 	size_t size;
351 	int ix;
352 	struct vnode *vp;
353 	size_t minsize;
354 	int maxsecsize;
355 	struct partinfo dpart;
356 	struct ccdgeom *ccg = &cs->sc_geom;
357 	char tmppath[MAXPATHLEN];
358 	int error = 0;
359 	struct ucred *cred;
360 
361 	KKASSERT(td->td_proc);
362 	cred = td->td_proc->p_ucred;
363 
364 #ifdef DEBUG
365 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
366 		printf("ccdinit: unit %d\n", ccd->ccd_unit);
367 #endif
368 
369 	cs->sc_size = 0;
370 	cs->sc_ileave = ccd->ccd_interleave;
371 	cs->sc_nccdisks = ccd->ccd_ndev;
372 
373 	/* Allocate space for the component info. */
374 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
375 	    M_DEVBUF, M_WAITOK);
376 
377 	/*
378 	 * Verify that each component piece exists and record
379 	 * relevant information about it.
380 	 */
381 	maxsecsize = 0;
382 	minsize = 0;
383 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
384 		vp = ccd->ccd_vpp[ix];
385 		ci = &cs->sc_cinfo[ix];
386 		ci->ci_vp = vp;
387 
388 		/*
389 		 * Copy in the pathname of the component.
390 		 */
391 		bzero(tmppath, sizeof(tmppath));	/* sanity */
392 		if ((error = copyinstr(cpaths[ix], tmppath,
393 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
394 #ifdef DEBUG
395 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
396 				printf("ccd%d: can't copy path, error = %d\n",
397 				    ccd->ccd_unit, error);
398 #endif
399 			goto fail;
400 		}
401 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
402 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
403 
404 		ci->ci_dev = vn_todev(vp);
405 
406 		/*
407 		 * Get partition information for the component.
408 		 */
409 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
410 		    FREAD, cred, td)) != 0) {
411 #ifdef DEBUG
412 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
413 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
414 				     ccd->ccd_unit, ci->ci_path, error);
415 #endif
416 			goto fail;
417 		}
418 		if (dpart.part->p_fstype == FS_BSDFFS) {
419 			maxsecsize =
420 			    ((dpart.disklab->d_secsize > maxsecsize) ?
421 			    dpart.disklab->d_secsize : maxsecsize);
422 			size = dpart.part->p_size - CCD_OFFSET;
423 		} else {
424 #ifdef DEBUG
425 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
426 				printf("ccd%d: %s: incorrect partition type\n",
427 				    ccd->ccd_unit, ci->ci_path);
428 #endif
429 			error = EFTYPE;
430 			goto fail;
431 		}
432 
433 		/*
434 		 * Calculate the size, truncating to an interleave
435 		 * boundary if necessary.
436 		 */
437 
438 		if (cs->sc_ileave > 1)
439 			size -= size % cs->sc_ileave;
440 
441 		if (size == 0) {
442 #ifdef DEBUG
443 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
444 				printf("ccd%d: %s: size == 0\n",
445 				    ccd->ccd_unit, ci->ci_path);
446 #endif
447 			error = ENODEV;
448 			goto fail;
449 		}
450 
451 		if (minsize == 0 || size < minsize)
452 			minsize = size;
453 		ci->ci_size = size;
454 		cs->sc_size += size;
455 	}
456 
457 	/*
458 	 * Don't allow the interleave to be smaller than
459 	 * the biggest component sector.
460 	 */
461 	if ((cs->sc_ileave > 0) &&
462 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
463 #ifdef DEBUG
464 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
465 			printf("ccd%d: interleave must be at least %d\n",
466 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
467 #endif
468 		error = EINVAL;
469 		goto fail;
470 	}
471 
472 	/*
473 	 * If uniform interleave is desired set all sizes to that of
474 	 * the smallest component.  This will guarentee that a single
475 	 * interleave table is generated.
476 	 *
477 	 * Lost space must be taken into account when calculating the
478 	 * overall size.  Half the space is lost when CCDF_MIRROR is
479 	 * specified.  One disk is lost when CCDF_PARITY is specified.
480 	 */
481 	if (ccd->ccd_flags & CCDF_UNIFORM) {
482 		for (ci = cs->sc_cinfo;
483 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
484 			ci->ci_size = minsize;
485 		}
486 		if (ccd->ccd_flags & CCDF_MIRROR) {
487 			/*
488 			 * Check to see if an even number of components
489 			 * have been specified.  The interleave must also
490 			 * be non-zero in order for us to be able to
491 			 * guarentee the topology.
492 			 */
493 			if (cs->sc_nccdisks % 2) {
494 				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
495 				error = EINVAL;
496 				goto fail;
497 			}
498 			if (cs->sc_ileave == 0) {
499 				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
500 				error = EINVAL;
501 				goto fail;
502 			}
503 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
504 		} else if (ccd->ccd_flags & CCDF_PARITY) {
505 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
506 		} else {
507 			if (cs->sc_ileave == 0) {
508 				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
509 				error = EINVAL;
510 				goto fail;
511 			}
512 			cs->sc_size = cs->sc_nccdisks * minsize;
513 		}
514 	}
515 
516 	/*
517 	 * Construct the interleave table.
518 	 */
519 	ccdinterleave(cs, ccd->ccd_unit);
520 
521 	/*
522 	 * Create pseudo-geometry based on 1MB cylinders.  It's
523 	 * pretty close.
524 	 */
525 	ccg->ccg_secsize = maxsecsize;
526 	ccg->ccg_ntracks = 1;
527 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
528 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
529 
530 	/*
531 	 * Add an devstat entry for this device.
532 	 */
533 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
534 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
535 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
536 			  DEVSTAT_PRIORITY_ARRAY);
537 
538 	cs->sc_flags |= CCDF_INITED;
539 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
540 	cs->sc_unit = ccd->ccd_unit;
541 	return (0);
542 fail:
543 	while (ci > cs->sc_cinfo) {
544 		ci--;
545 		free(ci->ci_path, M_DEVBUF);
546 	}
547 	free(cs->sc_cinfo, M_DEVBUF);
548 	return (error);
549 }
550 
551 static void
552 ccdinterleave(struct ccd_softc *cs, int unit)
553 {
554 	struct ccdcinfo *ci, *smallci;
555 	struct ccdiinfo *ii;
556 	daddr_t bn, lbn;
557 	int ix;
558 	u_long size;
559 
560 #ifdef DEBUG
561 	if (ccddebug & CCDB_INIT)
562 		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
563 #endif
564 
565 	/*
566 	 * Allocate an interleave table.  The worst case occurs when each
567 	 * of N disks is of a different size, resulting in N interleave
568 	 * tables.
569 	 *
570 	 * Chances are this is too big, but we don't care.
571 	 */
572 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
573 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
574 	bzero((caddr_t)cs->sc_itable, size);
575 
576 	/*
577 	 * Trivial case: no interleave (actually interleave of disk size).
578 	 * Each table entry represents a single component in its entirety.
579 	 *
580 	 * An interleave of 0 may not be used with a mirror or parity setup.
581 	 */
582 	if (cs->sc_ileave == 0) {
583 		bn = 0;
584 		ii = cs->sc_itable;
585 
586 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
587 			/* Allocate space for ii_index. */
588 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
589 			ii->ii_ndisk = 1;
590 			ii->ii_startblk = bn;
591 			ii->ii_startoff = 0;
592 			ii->ii_index[0] = ix;
593 			bn += cs->sc_cinfo[ix].ci_size;
594 			ii++;
595 		}
596 		ii->ii_ndisk = 0;
597 #ifdef DEBUG
598 		if (ccddebug & CCDB_INIT)
599 			printiinfo(cs->sc_itable);
600 #endif
601 		return;
602 	}
603 
604 	/*
605 	 * The following isn't fast or pretty; it doesn't have to be.
606 	 */
607 	size = 0;
608 	bn = lbn = 0;
609 	for (ii = cs->sc_itable; ; ii++) {
610 		/*
611 		 * Allocate space for ii_index.  We might allocate more then
612 		 * we use.
613 		 */
614 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
615 		    M_DEVBUF, M_WAITOK);
616 
617 		/*
618 		 * Locate the smallest of the remaining components
619 		 */
620 		smallci = NULL;
621 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
622 		    ci++) {
623 			if (ci->ci_size > size &&
624 			    (smallci == NULL ||
625 			     ci->ci_size < smallci->ci_size)) {
626 				smallci = ci;
627 			}
628 		}
629 
630 		/*
631 		 * Nobody left, all done
632 		 */
633 		if (smallci == NULL) {
634 			ii->ii_ndisk = 0;
635 			break;
636 		}
637 
638 		/*
639 		 * Record starting logical block using an sc_ileave blocksize.
640 		 */
641 		ii->ii_startblk = bn / cs->sc_ileave;
642 
643 		/*
644 		 * Record starting comopnent block using an sc_ileave
645 		 * blocksize.  This value is relative to the beginning of
646 		 * a component disk.
647 		 */
648 		ii->ii_startoff = lbn;
649 
650 		/*
651 		 * Determine how many disks take part in this interleave
652 		 * and record their indices.
653 		 */
654 		ix = 0;
655 		for (ci = cs->sc_cinfo;
656 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
657 			if (ci->ci_size >= smallci->ci_size) {
658 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
659 			}
660 		}
661 		ii->ii_ndisk = ix;
662 		bn += ix * (smallci->ci_size - size);
663 		lbn = smallci->ci_size / cs->sc_ileave;
664 		size = smallci->ci_size;
665 	}
666 #ifdef DEBUG
667 	if (ccddebug & CCDB_INIT)
668 		printiinfo(cs->sc_itable);
669 #endif
670 }
671 
672 /* ARGSUSED */
673 static int
674 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
675 {
676 	int unit = ccdunit(dev);
677 	struct ccd_softc *cs;
678 	struct disklabel *lp;
679 	int error = 0, part, pmask;
680 
681 #ifdef DEBUG
682 	if (ccddebug & CCDB_FOLLOW)
683 		printf("ccdopen(%x, %x)\n", dev, flags);
684 #endif
685 	if (unit >= numccd)
686 		return (ENXIO);
687 	cs = &ccd_softc[unit];
688 
689 	if ((error = ccdlock(cs)) != 0)
690 		return (error);
691 
692 	lp = &cs->sc_label;
693 
694 	part = ccdpart(dev);
695 	pmask = (1 << part);
696 
697 	/*
698 	 * If we're initialized, check to see if there are any other
699 	 * open partitions.  If not, then it's safe to update
700 	 * the in-core disklabel.
701 	 */
702 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
703 		ccdgetdisklabel(dev);
704 
705 	/* Check that the partition exists. */
706 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
707 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
708 		error = ENXIO;
709 		goto done;
710 	}
711 
712 	cs->sc_openmask |= pmask;
713  done:
714 	ccdunlock(cs);
715 	return (0);
716 }
717 
718 /* ARGSUSED */
719 static int
720 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
721 {
722 	int unit = ccdunit(dev);
723 	struct ccd_softc *cs;
724 	int error = 0, part;
725 
726 #ifdef DEBUG
727 	if (ccddebug & CCDB_FOLLOW)
728 		printf("ccdclose(%x, %x)\n", dev, flags);
729 #endif
730 
731 	if (unit >= numccd)
732 		return (ENXIO);
733 	cs = &ccd_softc[unit];
734 
735 	if ((error = ccdlock(cs)) != 0)
736 		return (error);
737 
738 	part = ccdpart(dev);
739 
740 	/* ...that much closer to allowing unconfiguration... */
741 	cs->sc_openmask &= ~(1 << part);
742 	ccdunlock(cs);
743 	return (0);
744 }
745 
746 static void
747 ccdstrategy(dev_t dev, struct bio *bio)
748 {
749 	int unit = ccdunit(dev);
750 	struct bio *nbio;
751 	struct buf *bp = bio->bio_buf;
752 	struct ccd_softc *cs = &ccd_softc[unit];
753 	int wlabel;
754 	struct disklabel *lp;
755 
756 #ifdef DEBUG
757 	if (ccddebug & CCDB_FOLLOW)
758 		printf("ccdstrategy(%x): unit %d\n", bp, unit);
759 #endif
760 	if ((cs->sc_flags & CCDF_INITED) == 0) {
761 		bp->b_error = ENXIO;
762 		bp->b_flags |= B_ERROR;
763 		goto done;
764 	}
765 
766 	/* If it's a nil transfer, wake up the top half now. */
767 	if (bp->b_bcount == 0)
768 		goto done;
769 
770 	lp = &cs->sc_label;
771 
772 	/*
773 	 * Do bounds checking and adjust transfer.  If there's an
774 	 * error, the bounds check will flag that for us.
775 	 */
776 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
777 	if (ccdpart(dev) != RAW_PART) {
778 		nbio = bounds_check_with_label(dev, bio, lp, wlabel);
779 		if (nbio == NULL)
780 			goto done;
781 	} else {
782 		int pbn;        /* in sc_secsize chunks */
783 		long sz;        /* in sc_secsize chunks */
784 
785 		pbn = (int)(bio->bio_offset / cs->sc_geom.ccg_secsize);
786 		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
787 
788 		/*
789 		 * If out of bounds return an error. If at the EOF point,
790 		 * simply read or write less.
791 		 */
792 
793 		if (pbn < 0 || pbn >= cs->sc_size) {
794 			bp->b_resid = bp->b_bcount;
795 			if (pbn != cs->sc_size) {
796 				bp->b_error = EINVAL;
797 				bp->b_flags |= B_ERROR | B_INVAL;
798 			}
799 			goto done;
800 		}
801 
802 		/*
803 		 * If the request crosses EOF, truncate the request.
804 		 */
805 		if (pbn + sz > cs->sc_size) {
806 			bp->b_bcount = (cs->sc_size - pbn) *
807 			    cs->sc_geom.ccg_secsize;
808 		}
809 		nbio = bio;
810 	}
811 
812 	bp->b_resid = bp->b_bcount;
813 	nbio->bio_driver_info = dev;
814 
815 	/*
816 	 * "Start" the unit.
817 	 */
818 	crit_enter();
819 	ccdstart(cs, nbio);
820 	crit_exit();
821 	return;
822 
823 	/*
824 	 * note: bio, not nbio, is valid at the done label.
825 	 */
826 done:
827 	biodone(bio);
828 }
829 
830 static void
831 ccdstart(struct ccd_softc *cs, struct bio *bio)
832 {
833 	long bcount, rcount;
834 	struct ccdbuf *cbp[4];
835 	struct buf *bp = bio->bio_buf;
836 	dev_t dev = bio->bio_driver_info;
837 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
838 	caddr_t addr;
839 	off_t doffset;
840 	struct partition *pp;
841 
842 #ifdef DEBUG
843 	if (ccddebug & CCDB_FOLLOW)
844 		printf("ccdstart(%x, %x)\n", cs, bp);
845 #endif
846 
847 	/* Record the transaction start  */
848 	devstat_start_transaction(&cs->device_stats);
849 
850 	/*
851 	 * Translate the partition-relative block number to an absolute.
852 	 */
853 	doffset = bio->bio_offset;
854 	if (ccdpart(dev) != RAW_PART) {
855 		pp = &cs->sc_label.d_partitions[ccdpart(dev)];
856 		doffset += pp->p_offset * cs->sc_label.d_secsize;
857 	}
858 
859 	/*
860 	 * Allocate component buffers and fire off the requests
861 	 */
862 	addr = bp->b_data;
863 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
864 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
865 		rcount = cbp[0]->cb_buf.b_bcount;
866 
867 		if (cs->sc_cflags & CCDF_MIRROR) {
868 			/*
869 			 * Mirroring.  Writes go to both disks, reads are
870 			 * taken from whichever disk seems most appropriate.
871 			 *
872 			 * We attempt to localize reads to the disk whos arm
873 			 * is nearest the read request.  We ignore seeks due
874 			 * to writes when making this determination and we
875 			 * also try to avoid hogging.
876 			 */
877 			if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
878 				vn_strategy(cbp[0]->cb_buf.b_vp,
879 				    &cbp[0]->cb_buf.b_bio1);
880 				vn_strategy(cbp[1]->cb_buf.b_vp,
881 				    &cbp[1]->cb_buf.b_bio1);
882 			} else {
883 				int pick = cs->sc_pick;
884 				daddr_t range = cs->sc_size / 16 * cs->sc_label.d_secsize;
885 
886 				if (doffset < cs->sc_blk[pick] - range ||
887 				    doffset > cs->sc_blk[pick] + range
888 				) {
889 					cs->sc_pick = pick = 1 - pick;
890 				}
891 				cs->sc_blk[pick] = doffset + rcount;
892 				vn_strategy(cbp[pick]->cb_buf.b_vp,
893 				    &cbp[pick]->cb_buf.b_bio1);
894 			}
895 		} else {
896 			/*
897 			 * Not mirroring
898 			 */
899 			vn_strategy(cbp[0]->cb_buf.b_vp,
900 				     &cbp[0]->cb_buf.b_bio1);
901 		}
902 		doffset += rcount;
903 		addr += rcount;
904 	}
905 }
906 
907 /*
908  * Build a component buffer header.
909  */
910 static void
911 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
912 	  off_t doffset, caddr_t addr, long bcount)
913 {
914 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
915 	struct ccdbuf *cbp;
916 	daddr_t bn, cbn, cboff;
917 	off_t cbc;
918 
919 #ifdef DEBUG
920 	if (ccddebug & CCDB_IO)
921 		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
922 		       cs, bp, bn, addr, bcount);
923 #endif
924 	/*
925 	 * Determine which component bn falls in.
926 	 */
927 	bn = (daddr_t)(doffset / cs->sc_geom.ccg_secsize);
928 	cbn = bn;
929 	cboff = 0;
930 
931 	if (cs->sc_ileave == 0) {
932 		/*
933 		 * Serially concatenated and neither a mirror nor a parity
934 		 * config.  This is a special case.
935 		 */
936 		daddr_t sblk;
937 
938 		sblk = 0;
939 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
940 			sblk += ci->ci_size;
941 		cbn -= sblk;
942 	} else {
943 		struct ccdiinfo *ii;
944 		int ccdisk, off;
945 
946 		/*
947 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
948 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
949 		 * to cbn.
950 		 */
951 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
952 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
953 
954 		/*
955 		 * Figure out which interleave table to use.
956 		 */
957 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
958 			if (ii->ii_startblk > cbn)
959 				break;
960 		}
961 		ii--;
962 
963 		/*
964 		 * off is the logical superblock relative to the beginning
965 		 * of this interleave block.
966 		 */
967 		off = cbn - ii->ii_startblk;
968 
969 		/*
970 		 * We must calculate which disk component to use (ccdisk),
971 		 * and recalculate cbn to be the superblock relative to
972 		 * the beginning of the component.  This is typically done by
973 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
974 		 * must typically be divided by the number of components in
975 		 * this interleave array to be properly convert it from a
976 		 * CCD-relative logical superblock number to a
977 		 * component-relative superblock number.
978 		 */
979 		if (ii->ii_ndisk == 1) {
980 			/*
981 			 * When we have just one disk, it can't be a mirror
982 			 * or a parity config.
983 			 */
984 			ccdisk = ii->ii_index[0];
985 			cbn = ii->ii_startoff + off;
986 		} else {
987 			if (cs->sc_cflags & CCDF_MIRROR) {
988 				/*
989 				 * We have forced a uniform mapping, resulting
990 				 * in a single interleave array.  We double
991 				 * up on the first half of the available
992 				 * components and our mirror is in the second
993 				 * half.  This only works with a single
994 				 * interleave array because doubling up
995 				 * doubles the number of sectors, so there
996 				 * cannot be another interleave array because
997 				 * the next interleave array's calculations
998 				 * would be off.
999 				 */
1000 				int ndisk2 = ii->ii_ndisk / 2;
1001 				ccdisk = ii->ii_index[off % ndisk2];
1002 				cbn = ii->ii_startoff + off / ndisk2;
1003 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1004 			} else if (cs->sc_cflags & CCDF_PARITY) {
1005 				/*
1006 				 * XXX not implemented yet
1007 				 */
1008 				int ndisk2 = ii->ii_ndisk - 1;
1009 				ccdisk = ii->ii_index[off % ndisk2];
1010 				cbn = ii->ii_startoff + off / ndisk2;
1011 				if (cbn % ii->ii_ndisk <= ccdisk)
1012 					ccdisk++;
1013 			} else {
1014 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1015 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1016 			}
1017 		}
1018 
1019 		ci = &cs->sc_cinfo[ccdisk];
1020 
1021 		/*
1022 		 * Convert cbn from a superblock to a normal block so it
1023 		 * can be used to calculate (along with cboff) the normal
1024 		 * block index into this particular disk.
1025 		 */
1026 		cbn *= cs->sc_ileave;
1027 	}
1028 
1029 	/*
1030 	 * Fill in the component buf structure.
1031 	 */
1032 	cbp = getccdbuf();
1033 	cbp->cb_buf.b_flags = bio->bio_buf->b_flags;
1034 	cbp->cb_buf.b_data = addr;
1035 	cbp->cb_buf.b_vp = ci->ci_vp;
1036 	if (cs->sc_ileave == 0)
1037               cbc = dbtob((off_t)(ci->ci_size - cbn));
1038 	else
1039               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1040 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1041  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1042 
1043 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1044 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1045 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1046 
1047 	/*
1048 	 * context for ccdiodone
1049 	 */
1050 	cbp->cb_obio = bio;
1051 	cbp->cb_unit = cs - ccd_softc;
1052 	cbp->cb_comp = ci - cs->sc_cinfo;
1053 
1054 #ifdef DEBUG
1055 	if (ccddebug & CCDB_IO)
1056 		printf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1057 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1058 		       cbp->cb_buf.b_bio1.bio_offset,
1059 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1060 #endif
1061 	cb[0] = cbp;
1062 
1063 	/*
1064 	 * Note: both I/O's setup when reading from mirror, but only one
1065 	 * will be executed.
1066 	 */
1067 	if (cs->sc_cflags & CCDF_MIRROR) {
1068 		/* mirror, setup second I/O */
1069 		cbp = getccdbuf();
1070 
1071 		cbp->cb_buf.b_flags = bio->bio_buf->b_flags;
1072 		cbp->cb_buf.b_data = addr;
1073 		cbp->cb_buf.b_vp = ci2->ci_vp;
1074 		if (cs->sc_ileave == 0)
1075 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1076 		else
1077 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1078 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1079 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1080 
1081 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1082 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1083 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1084 
1085 		/*
1086 		 * context for ccdiodone
1087 		 */
1088 		cbp->cb_obio = bio;
1089 		cbp->cb_unit = cs - ccd_softc;
1090 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1091 		cb[1] = cbp;
1092 		/* link together the ccdbuf's and clear "mirror done" flag */
1093 		cb[0]->cb_mirror = cb[1];
1094 		cb[1]->cb_mirror = cb[0];
1095 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1096 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1097 	}
1098 }
1099 
1100 static void
1101 ccdintr(struct ccd_softc *cs, struct bio *bio)
1102 {
1103 	struct buf *bp = bio->bio_buf;
1104 
1105 #ifdef DEBUG
1106 	if (ccddebug & CCDB_FOLLOW)
1107 		printf("ccdintr(%x, %x)\n", cs, bp);
1108 #endif
1109 	/*
1110 	 * Request is done for better or worse, wakeup the top half.
1111 	 */
1112 	if (bp->b_flags & B_ERROR)
1113 		bp->b_resid = bp->b_bcount;
1114 	devstat_end_transaction_buf(&cs->device_stats, bp);
1115 	biodone(bio);
1116 }
1117 
1118 /*
1119  * Called at interrupt time.
1120  * Mark the component as done and if all components are done,
1121  * take a ccd interrupt.
1122  */
1123 static void
1124 ccdiodone(struct bio *bio)
1125 {
1126 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1127 	struct bio *obio = cbp->cb_obio;
1128 	struct buf *obp = obio->bio_buf;
1129 	int unit = cbp->cb_unit;
1130 	int count;
1131 
1132 	/*
1133 	 * Since we do not have exclusive access to underlying devices,
1134 	 * we can't keep cache translations around.
1135 	 */
1136 	clearbiocache(bio->bio_next);
1137 
1138 	crit_enter();
1139 #ifdef DEBUG
1140 	if (ccddebug & CCDB_FOLLOW)
1141 		printf("ccdiodone(%x)\n", cbp);
1142 	if (ccddebug & CCDB_IO) {
1143 		printf("ccdiodone: bp %x bcount %d resid %d\n",
1144 		       obp, obp->b_bcount, obp->b_resid);
1145 		printf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1146 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1147 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1148 		       cbp->cb_buf.b_bcount);
1149 	}
1150 #endif
1151 	/*
1152 	 * If an error occured, report it.  If this is a mirrored
1153 	 * configuration and the first of two possible reads, do not
1154 	 * set the error in the bp yet because the second read may
1155 	 * succeed.
1156 	 */
1157 	if (cbp->cb_buf.b_flags & B_ERROR) {
1158 		const char *msg = "";
1159 
1160 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1161 		    (cbp->cb_buf.b_flags & B_READ) &&
1162 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1163 			/*
1164 			 * We will try our read on the other disk down
1165 			 * below, also reverse the default pick so if we
1166 			 * are doing a scan we do not keep hitting the
1167 			 * bad disk first.
1168 			 */
1169 			struct ccd_softc *cs = &ccd_softc[unit];
1170 
1171 			msg = ", trying other disk";
1172 			cs->sc_pick = 1 - cs->sc_pick;
1173 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1174 		} else {
1175 			obp->b_flags |= B_ERROR;
1176 			obp->b_error = cbp->cb_buf.b_error ?
1177 			    cbp->cb_buf.b_error : EIO;
1178 		}
1179 		printf("ccd%d: error %d on component %d offset %lld (ccd offset %lld)%s\n",
1180 		       unit, obp->b_error, cbp->cb_comp,
1181 		       cbp->cb_buf.b_bio2.bio_offset,
1182 		       obio->bio_offset, msg);
1183 	}
1184 
1185 	/*
1186 	 * Process mirror.  If we are writing, I/O has been initiated on both
1187 	 * buffers and we fall through only after both are finished.
1188 	 *
1189 	 * If we are reading only one I/O is initiated at a time.  If an
1190 	 * error occurs we initiate the second I/O and return, otherwise
1191 	 * we free the second I/O without initiating it.
1192 	 */
1193 
1194 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1195 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1196 			/*
1197 			 * When writing, handshake with the second buffer
1198 			 * to determine when both are done.  If both are not
1199 			 * done, return here.
1200 			 */
1201 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1203 				putccdbuf(cbp);
1204 				crit_exit();
1205 				return;
1206 			}
1207 		} else {
1208 			/*
1209 			 * When reading, either dispose of the second buffer
1210 			 * or initiate I/O on the second buffer if an error
1211 			 * occured with this one.
1212 			 */
1213 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1214 				if (cbp->cb_buf.b_flags & B_ERROR) {
1215 					cbp->cb_mirror->cb_pflags |=
1216 					    CCDPF_MIRROR_DONE;
1217 					vn_strategy(
1218 					    cbp->cb_mirror->cb_buf.b_vp,
1219 					    &cbp->cb_mirror->cb_buf.b_bio1
1220 					);
1221 					putccdbuf(cbp);
1222 					crit_exit();
1223 					return;
1224 				} else {
1225 					putccdbuf(cbp->cb_mirror);
1226 					/* fall through */
1227 				}
1228 			}
1229 		}
1230 	}
1231 
1232 	/*
1233 	 * use b_bufsize to determine how big the original request was rather
1234 	 * then b_bcount, because b_bcount may have been truncated for EOF.
1235 	 *
1236 	 * XXX We check for an error, but we do not test the resid for an
1237 	 * aligned EOF condition.  This may result in character & block
1238 	 * device access not recognizing EOF properly when read or written
1239 	 * sequentially, but will not effect filesystems.
1240 	 */
1241 	count = cbp->cb_buf.b_bufsize;
1242 	putccdbuf(cbp);
1243 
1244 	/*
1245 	 * If all done, "interrupt".
1246 	 */
1247 	obp->b_resid -= count;
1248 	if (obp->b_resid < 0)
1249 		panic("ccdiodone: count");
1250 	if (obp->b_resid == 0)
1251 		ccdintr(&ccd_softc[unit], obio);
1252 	crit_exit();
1253 }
1254 
1255 static int
1256 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1257 {
1258 	int unit = ccdunit(dev);
1259 	int i, j, lookedup = 0, error = 0;
1260 	int part, pmask;
1261 	struct ccd_softc *cs;
1262 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1263 	struct ccddevice ccd;
1264 	char **cpp;
1265 	struct vnode **vpp;
1266 	struct ucred *cred;
1267 
1268 	KKASSERT(td->td_proc != NULL);
1269 	cred = td->td_proc->p_ucred;
1270 
1271 	if (unit >= numccd)
1272 		return (ENXIO);
1273 	cs = &ccd_softc[unit];
1274 
1275 	bzero(&ccd, sizeof(ccd));
1276 
1277 	switch (cmd) {
1278 	case CCDIOCSET:
1279 		if (cs->sc_flags & CCDF_INITED)
1280 			return (EBUSY);
1281 
1282 		if ((flag & FWRITE) == 0)
1283 			return (EBADF);
1284 
1285 		if ((error = ccdlock(cs)) != 0)
1286 			return (error);
1287 
1288 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1289 			return (EINVAL);
1290 
1291 		/* Fill in some important bits. */
1292 		ccd.ccd_unit = unit;
1293 		ccd.ccd_interleave = ccio->ccio_ileave;
1294 		if (ccd.ccd_interleave == 0 &&
1295 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1296 		     (ccio->ccio_flags & CCDF_PARITY))) {
1297 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1298 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1299 		}
1300 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1301 		    (ccio->ccio_flags & CCDF_PARITY)) {
1302 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1303 			ccio->ccio_flags &= ~CCDF_PARITY;
1304 		}
1305 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1306 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1307 			printf("ccd%d: mirror/parity forces uniform flag\n",
1308 			       unit);
1309 			ccio->ccio_flags |= CCDF_UNIFORM;
1310 		}
1311 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1312 
1313 		/*
1314 		 * Allocate space for and copy in the array of
1315 		 * componet pathnames and device numbers.
1316 		 */
1317 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1318 		    M_DEVBUF, M_WAITOK);
1319 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1320 		    M_DEVBUF, M_WAITOK);
1321 
1322 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1323 		    ccio->ccio_ndisks * sizeof(char **));
1324 		if (error) {
1325 			free(vpp, M_DEVBUF);
1326 			free(cpp, M_DEVBUF);
1327 			ccdunlock(cs);
1328 			return (error);
1329 		}
1330 
1331 #ifdef DEBUG
1332 		if (ccddebug & CCDB_INIT)
1333 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1334 				printf("ccdioctl: component %d: 0x%x\n",
1335 				    i, cpp[i]);
1336 #endif
1337 
1338 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1339 #ifdef DEBUG
1340 			if (ccddebug & CCDB_INIT)
1341 				printf("ccdioctl: lookedup = %d\n", lookedup);
1342 #endif
1343 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1344 				for (j = 0; j < lookedup; ++j)
1345 					(void)vn_close(vpp[j], FREAD|FWRITE, td);
1346 				free(vpp, M_DEVBUF);
1347 				free(cpp, M_DEVBUF);
1348 				ccdunlock(cs);
1349 				return (error);
1350 			}
1351 			++lookedup;
1352 		}
1353 		ccd.ccd_cpp = cpp;
1354 		ccd.ccd_vpp = vpp;
1355 		ccd.ccd_ndev = ccio->ccio_ndisks;
1356 
1357 		/*
1358 		 * Initialize the ccd.  Fills in the softc for us.
1359 		 */
1360 		if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1361 			for (j = 0; j < lookedup; ++j)
1362 				(void)vn_close(vpp[j], FREAD|FWRITE, td);
1363 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1364 			free(vpp, M_DEVBUF);
1365 			free(cpp, M_DEVBUF);
1366 			ccdunlock(cs);
1367 			return (error);
1368 		}
1369 
1370 		/*
1371 		 * The ccd has been successfully initialized, so
1372 		 * we can place it into the array and read the disklabel.
1373 		 */
1374 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1375 		ccio->ccio_unit = unit;
1376 		ccio->ccio_size = cs->sc_size;
1377 		ccdgetdisklabel(dev);
1378 
1379 		ccdunlock(cs);
1380 
1381 		break;
1382 
1383 	case CCDIOCCLR:
1384 		if ((cs->sc_flags & CCDF_INITED) == 0)
1385 			return (ENXIO);
1386 
1387 		if ((flag & FWRITE) == 0)
1388 			return (EBADF);
1389 
1390 		if ((error = ccdlock(cs)) != 0)
1391 			return (error);
1392 
1393 		/* Don't unconfigure if any other partitions are open */
1394 		part = ccdpart(dev);
1395 		pmask = (1 << part);
1396 		if ((cs->sc_openmask & ~pmask)) {
1397 			ccdunlock(cs);
1398 			return (EBUSY);
1399 		}
1400 
1401 		/*
1402 		 * Free ccd_softc information and clear entry.
1403 		 */
1404 
1405 		/* Close the components and free their pathnames. */
1406 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1407 			/*
1408 			 * XXX: this close could potentially fail and
1409 			 * cause Bad Things.  Maybe we need to force
1410 			 * the close to happen?
1411 			 */
1412 #ifdef DEBUG
1413 			if (ccddebug & CCDB_VNODE)
1414 				vprint("CCDIOCCLR: vnode info",
1415 				    cs->sc_cinfo[i].ci_vp);
1416 #endif
1417 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, td);
1418 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1419 		}
1420 
1421 		/* Free interleave index. */
1422 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1423 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1424 
1425 		/* Free component info and interleave table. */
1426 		free(cs->sc_cinfo, M_DEVBUF);
1427 		free(cs->sc_itable, M_DEVBUF);
1428 		cs->sc_flags &= ~CCDF_INITED;
1429 
1430 		/*
1431 		 * Free ccddevice information and clear entry.
1432 		 */
1433 		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1434 		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1435 		ccd.ccd_dk = -1;
1436 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1437 
1438 		/*
1439 		 * And remove the devstat entry.
1440 		 */
1441 		devstat_remove_entry(&cs->device_stats);
1442 
1443 		/* This must be atomic. */
1444 		crit_enter();
1445 		ccdunlock(cs);
1446 		bzero(cs, sizeof(struct ccd_softc));
1447 		crit_exit();
1448 
1449 		break;
1450 
1451 	case DIOCGDINFO:
1452 		if ((cs->sc_flags & CCDF_INITED) == 0)
1453 			return (ENXIO);
1454 
1455 		*(struct disklabel *)data = cs->sc_label;
1456 		break;
1457 
1458 	case DIOCGPART:
1459 		if ((cs->sc_flags & CCDF_INITED) == 0)
1460 			return (ENXIO);
1461 
1462 		((struct partinfo *)data)->disklab = &cs->sc_label;
1463 		((struct partinfo *)data)->part =
1464 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1465 		break;
1466 
1467 	case DIOCWDINFO:
1468 	case DIOCSDINFO:
1469 		if ((cs->sc_flags & CCDF_INITED) == 0)
1470 			return (ENXIO);
1471 
1472 		if ((flag & FWRITE) == 0)
1473 			return (EBADF);
1474 
1475 		if ((error = ccdlock(cs)) != 0)
1476 			return (error);
1477 
1478 		cs->sc_flags |= CCDF_LABELLING;
1479 
1480 		error = setdisklabel(&cs->sc_label,
1481 		    (struct disklabel *)data, 0);
1482 		if (error == 0) {
1483 			if (cmd == DIOCWDINFO) {
1484 				dev_t cdev = CCDLABELDEV(dev);
1485 				error = writedisklabel(cdev, &cs->sc_label);
1486 			}
1487 		}
1488 
1489 		cs->sc_flags &= ~CCDF_LABELLING;
1490 
1491 		ccdunlock(cs);
1492 
1493 		if (error)
1494 			return (error);
1495 		break;
1496 
1497 	case DIOCWLABEL:
1498 		if ((cs->sc_flags & CCDF_INITED) == 0)
1499 			return (ENXIO);
1500 
1501 		if ((flag & FWRITE) == 0)
1502 			return (EBADF);
1503 		if (*(int *)data != 0)
1504 			cs->sc_flags |= CCDF_WLABEL;
1505 		else
1506 			cs->sc_flags &= ~CCDF_WLABEL;
1507 		break;
1508 
1509 	default:
1510 		return (ENOTTY);
1511 	}
1512 
1513 	return (0);
1514 }
1515 
1516 static int
1517 ccdsize(dev_t dev)
1518 {
1519 	struct ccd_softc *cs;
1520 	int part, size;
1521 
1522 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1523 		return (-1);
1524 
1525 	cs = &ccd_softc[ccdunit(dev)];
1526 	part = ccdpart(dev);
1527 
1528 	if ((cs->sc_flags & CCDF_INITED) == 0)
1529 		return (-1);
1530 
1531 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1532 		size = -1;
1533 	else
1534 		size = cs->sc_label.d_partitions[part].p_size;
1535 
1536 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1537 		return (-1);
1538 
1539 	return (size);
1540 }
1541 
1542 static int
1543 ccddump(dev_t dev, u_int count, u_int blkno, u_int secsize)
1544 {
1545 	/* Not implemented. */
1546 	return ENXIO;
1547 }
1548 
1549 /*
1550  * Lookup the provided name in the filesystem.  If the file exists,
1551  * is a valid block device, and isn't being used by anyone else,
1552  * set *vpp to the file's vnode.
1553  */
1554 static int
1555 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1556 {
1557 	struct nlookupdata nd;
1558 	struct ucred *cred;
1559 	struct vnode *vp;
1560 	int error;
1561 
1562 	KKASSERT(td->td_proc);
1563 	cred = td->td_proc->p_ucred;
1564 	*vpp = NULL;
1565 
1566 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1567 	if (error)
1568 		return (error);
1569 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1570 #ifdef DEBUG
1571 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1572 			printf("ccdlookup: vn_open error = %d\n", error);
1573 #endif
1574 		goto done;
1575 	}
1576 	vp = nd.nl_open_vp;
1577 
1578 	if (vp->v_usecount > 1) {
1579 		error = EBUSY;
1580 		goto done;
1581 	}
1582 
1583 	if (!vn_isdisk(vp, &error))
1584 		goto done;
1585 
1586 #ifdef DEBUG
1587 	if (ccddebug & CCDB_VNODE)
1588 		vprint("ccdlookup: vnode info", vp);
1589 #endif
1590 
1591 	VOP_UNLOCK(vp, 0, td);
1592 	nd.nl_open_vp = NULL;
1593 	nlookup_done(&nd);
1594 	*vpp = vp;				/* leave ref intact  */
1595 	return (0);
1596 done:
1597 	nlookup_done(&nd);
1598 	return (error);
1599 }
1600 
1601 /*
1602  * Read the disklabel from the ccd.  If one is not present, fake one
1603  * up.
1604  */
1605 static void
1606 ccdgetdisklabel(dev_t dev)
1607 {
1608 	int unit = ccdunit(dev);
1609 	struct ccd_softc *cs = &ccd_softc[unit];
1610 	char *errstring;
1611 	struct disklabel *lp = &cs->sc_label;
1612 	struct ccdgeom *ccg = &cs->sc_geom;
1613 	dev_t cdev;
1614 
1615 	bzero(lp, sizeof(*lp));
1616 
1617 	lp->d_secperunit = cs->sc_size;
1618 	lp->d_secsize = ccg->ccg_secsize;
1619 	lp->d_nsectors = ccg->ccg_nsectors;
1620 	lp->d_ntracks = ccg->ccg_ntracks;
1621 	lp->d_ncylinders = ccg->ccg_ncylinders;
1622 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1623 
1624 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1625 	lp->d_type = DTYPE_CCD;
1626 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1627 	lp->d_rpm = 3600;
1628 	lp->d_interleave = 1;
1629 	lp->d_flags = 0;
1630 
1631 	lp->d_partitions[RAW_PART].p_offset = 0;
1632 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1633 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1634 	lp->d_npartitions = RAW_PART + 1;
1635 
1636 	lp->d_bbsize = BBSIZE;				/* XXX */
1637 	lp->d_sbsize = SBSIZE;				/* XXX */
1638 
1639 	lp->d_magic = DISKMAGIC;
1640 	lp->d_magic2 = DISKMAGIC;
1641 	lp->d_checksum = dkcksum(&cs->sc_label);
1642 
1643 	/*
1644 	 * Call the generic disklabel extraction routine.
1645 	 */
1646 	cdev = CCDLABELDEV(dev);
1647 	errstring = readdisklabel(cdev, &cs->sc_label);
1648 	if (errstring != NULL)
1649 		ccdmakedisklabel(cs);
1650 
1651 #ifdef DEBUG
1652 	/* It's actually extremely common to have unlabeled ccds. */
1653 	if (ccddebug & CCDB_LABEL)
1654 		if (errstring != NULL)
1655 			printf("ccd%d: %s\n", unit, errstring);
1656 #endif
1657 }
1658 
1659 /*
1660  * Take care of things one might want to take care of in the event
1661  * that a disklabel isn't present.
1662  */
1663 static void
1664 ccdmakedisklabel(struct ccd_softc *cs)
1665 {
1666 	struct disklabel *lp = &cs->sc_label;
1667 
1668 	/*
1669 	 * For historical reasons, if there's no disklabel present
1670 	 * the raw partition must be marked FS_BSDFFS.
1671 	 */
1672 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1673 
1674 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1675 }
1676 
1677 /*
1678  * Wait interruptibly for an exclusive lock.
1679  *
1680  * XXX
1681  * Several drivers do this; it should be abstracted and made MP-safe.
1682  */
1683 static int
1684 ccdlock(struct ccd_softc *cs)
1685 {
1686 	int error;
1687 
1688 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1689 		cs->sc_flags |= CCDF_WANTED;
1690 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1691 			return (error);
1692 	}
1693 	cs->sc_flags |= CCDF_LOCKED;
1694 	return (0);
1695 }
1696 
1697 /*
1698  * Unlock and wake up any waiters.
1699  */
1700 static void
1701 ccdunlock(struct ccd_softc *cs)
1702 {
1703 
1704 	cs->sc_flags &= ~CCDF_LOCKED;
1705 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1706 		cs->sc_flags &= ~CCDF_WANTED;
1707 		wakeup(cs);
1708 	}
1709 }
1710 
#ifdef DEBUG
/*
 * Debug helper: print each entry of an interleave table, one per line,
 * stopping at the first entry whose ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;

	while (ii->ii_ndisk != 0) {
		int d;

		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		/* Followed by the component indices of this entry. */
		for (d = 0; d < ii->ii_ndisk; d++)
			printf(" %d", ii->ii_index[d]);
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1726 
1727 
1728 /* Local Variables: */
1729 /* c-argdecl-indent: 8 */
1730 /* c-continued-statement-offset: 8 */
1731 /* c-indent-level: 8 */
1732 /* End: */
1733