xref: /netbsd-src/sys/dev/ccd.c (revision f14316bcbc544b96a93e884bc5c2b15fd60e22ae)
1 /*	$NetBSD: ccd.c,v 1.151 2014/07/25 08:10:35 dholland Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 1999, 2007, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe, and by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * "Concatenated" disk driver.
72  *
73  * Notes on concurrency:
74  *
75  * => sc_dvlock serializes access to the device nodes, excluding block I/O.
76  *
77  * => sc_iolock serializes access to (sc_flags & CCDF_INITED), disk stats,
78  *    sc_stop, sc_bufq and b_resid from master buffers.
79  *
80  * => a combination of CCDF_INITED, sc_inflight, and sc_iolock is used to
81  *    serialize I/O and configuration changes.
82  *
83  * => the in-core disk label does not change while the device is open.
84  *
85  * On memory consumption: ccd fans out I/O requests and so needs to
86  * allocate memory.  If the system is desperately low on memory, we
87  * single thread I/O.
88  */
89 
90 #include <sys/cdefs.h>
91 __KERNEL_RCSID(0, "$NetBSD: ccd.c,v 1.151 2014/07/25 08:10:35 dholland Exp $");
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/proc.h>
97 #include <sys/errno.h>
98 #include <sys/buf.h>
99 #include <sys/kmem.h>
100 #include <sys/pool.h>
101 #include <sys/module.h>
102 #include <sys/namei.h>
103 #include <sys/stat.h>
104 #include <sys/ioctl.h>
105 #include <sys/disklabel.h>
106 #include <sys/device.h>
107 #include <sys/disk.h>
108 #include <sys/syslog.h>
109 #include <sys/fcntl.h>
110 #include <sys/vnode.h>
111 #include <sys/conf.h>
112 #include <sys/mutex.h>
113 #include <sys/queue.h>
114 #include <sys/kauth.h>
115 #include <sys/kthread.h>
116 #include <sys/bufq.h>
117 #include <sys/sysctl.h>
118 
119 #include <uvm/uvm_extern.h>
120 
121 #include <dev/ccdvar.h>
122 #include <dev/dkvar.h>
123 
124 #include <miscfs/specfs/specdev.h> /* for v_rdev */
125 
/* Building with CCDDEBUG turns on the generic DEBUG code paths as well. */
#if !defined(DEBUG) && defined(CCDDEBUG)
#define DEBUG
#endif

#if defined(DEBUG)
/*
 * Bits tested against ccddebug to select which debug printfs fire.
 * CCDB_LABEL and CCDB_VNODE have no users in the part of the file
 * reviewed here -- presumably used further down; verify.
 */
#define CCDB_FOLLOW	0x01	/* entry/exit tracing of driver routines */
#define CCDB_INIT	0x02	/* configuration and interleave setup */
#define CCDB_IO		0x04	/* per-component buffer details */
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
int ccddebug = 0x00;		/* all debug output off by default */
#endif
138 
/* Unit number encoded in a ccd dev_t. */
#define	ccdunit(dev)	DISKUNIT(dev)
140 
141 struct ccdbuf {
142 	struct buf	cb_buf;		/* new I/O buf */
143 	struct buf	*cb_obp;	/* ptr. to original I/O buf */
144 	struct ccd_softc *cb_sc;	/* pointer to ccd softc */
145 	int		cb_comp;	/* target component */
146 	SIMPLEQ_ENTRY(ccdbuf) cb_q;	/* fifo of component buffers */
147 };
148 
149 /* component buffer pool */
150 static pool_cache_t ccd_cache;
151 
152 #define	CCD_GETBUF()		pool_cache_get(ccd_cache, PR_WAITOK)
153 #define	CCD_PUTBUF(cbp)		pool_cache_put(ccd_cache, cbp)
154 
155 #define CCDLABELDEV(dev)	\
156 	(MAKEDISKDEV(major((dev)), ccdunit((dev)), RAW_PART))
157 
158 /* called by main() at boot time */
159 void	ccdattach(int);
160 
161 /* called by biodone() at interrupt time */
162 static void	ccdiodone(struct buf *);
163 
164 static void	ccdinterleave(struct ccd_softc *);
165 static int	ccdinit(struct ccd_softc *, char **, struct vnode **,
166 		    struct lwp *);
167 static struct ccdbuf *ccdbuffer(struct ccd_softc *, struct buf *,
168 		    daddr_t, void *, long);
169 static void	ccdgetdefaultlabel(struct ccd_softc *, struct disklabel *);
170 static void	ccdgetdisklabel(dev_t);
171 static void	ccdmakedisklabel(struct ccd_softc *);
172 static void	ccdstart(struct ccd_softc *);
173 static void	ccdthread(void *);
174 
175 static dev_type_open(ccdopen);
176 static dev_type_close(ccdclose);
177 static dev_type_read(ccdread);
178 static dev_type_write(ccdwrite);
179 static dev_type_ioctl(ccdioctl);
180 static dev_type_strategy(ccdstrategy);
181 static dev_type_size(ccdsize);
182 
183 const struct bdevsw ccd_bdevsw = {
184 	.d_open = ccdopen,
185 	.d_close = ccdclose,
186 	.d_strategy = ccdstrategy,
187 	.d_ioctl = ccdioctl,
188 	.d_dump = nodump,
189 	.d_psize = ccdsize,
190 	.d_discard = nodiscard,
191 	.d_flag = D_DISK | D_MPSAFE
192 };
193 
194 const struct cdevsw ccd_cdevsw = {
195 	.d_open = ccdopen,
196 	.d_close = ccdclose,
197 	.d_read = ccdread,
198 	.d_write = ccdwrite,
199 	.d_ioctl = ccdioctl,
200 	.d_stop = nostop,
201 	.d_tty = notty,
202 	.d_poll = nopoll,
203 	.d_mmap = nommap,
204 	.d_kqfilter = nokqfilter,
205 	.d_discard = nodiscard,
206 	.d_flag = D_DISK | D_MPSAFE
207 };
208 
209 #ifdef DEBUG
210 static	void printiinfo(struct ccdiinfo *);
211 #endif
212 
213 static LIST_HEAD(, ccd_softc) ccds = LIST_HEAD_INITIALIZER(ccds);
214 static kmutex_t ccd_lock;
215 
216 static struct ccd_softc *
217 ccdcreate(int unit) {
218 	struct ccd_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
219 	if (sc == NULL) {
220 #ifdef DIAGNOSTIC
221 		printf("%s: out of memory\n", __func__);
222 #endif
223 		return NULL;
224 	}
225 	/* Initialize per-softc structures. */
226 	snprintf(sc->sc_xname, sizeof(sc->sc_xname), "ccd%d", unit);
227 	mutex_init(&sc->sc_dvlock, MUTEX_DEFAULT, IPL_NONE);
228 	sc->sc_iolock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
229 	cv_init(&sc->sc_stop, "ccdstop");
230 	cv_init(&sc->sc_push, "ccdthr");
231 	disk_init(&sc->sc_dkdev, sc->sc_xname, NULL); /* XXX */
232 	return sc;
233 }
234 
235 static void
236 ccddestroy(struct ccd_softc *sc) {
237 	mutex_obj_free(sc->sc_iolock);
238 	mutex_exit(&sc->sc_dvlock);
239 	mutex_destroy(&sc->sc_dvlock);
240 	cv_destroy(&sc->sc_stop);
241 	cv_destroy(&sc->sc_push);
242 	disk_destroy(&sc->sc_dkdev);
243 	kmem_free(sc, sizeof(*sc));
244 }
245 
246 static struct ccd_softc *
247 ccdget(int unit) {
248 	struct ccd_softc *sc;
249 	if (unit < 0) {
250 #ifdef DIAGNOSTIC
251 		panic("%s: unit %d!", __func__, unit);
252 #endif
253 		return NULL;
254 	}
255 	mutex_enter(&ccd_lock);
256 	LIST_FOREACH(sc, &ccds, sc_link) {
257 		if (sc->sc_unit == unit) {
258 			mutex_exit(&ccd_lock);
259 			return sc;
260 		}
261 	}
262 	mutex_exit(&ccd_lock);
263 	if ((sc = ccdcreate(unit)) == NULL)
264 		return NULL;
265 	mutex_enter(&ccd_lock);
266 	LIST_INSERT_HEAD(&ccds, sc, sc_link);
267 	mutex_exit(&ccd_lock);
268 	return sc;
269 }
270 
271 static void
272 ccdput(struct ccd_softc *sc) {
273 	mutex_enter(&ccd_lock);
274 	LIST_REMOVE(sc, sc_link);
275 	mutex_exit(&ccd_lock);
276 	ccddestroy(sc);
277 }
278 
279 /*
280  * Called by main() during pseudo-device attachment.  All we need
281  * to do is allocate enough space for devices to be configured later.
282  */
283 void
284 ccdattach(int num)
285 {
286 	mutex_init(&ccd_lock, MUTEX_DEFAULT, IPL_NONE);
287 
288 	/* Initialize the component buffer pool. */
289 	ccd_cache = pool_cache_init(sizeof(struct ccdbuf), 0,
290 	    0, 0, "ccdbuf", NULL, IPL_BIO, NULL, NULL, NULL);
291 }
292 
293 static int
294 ccdinit(struct ccd_softc *cs, char **cpaths, struct vnode **vpp,
295     struct lwp *l)
296 {
297 	struct ccdcinfo *ci = NULL;
298 	int ix;
299 	struct ccdgeom *ccg = &cs->sc_geom;
300 	char *tmppath;
301 	int error, path_alloced;
302 	uint64_t psize, minsize;
303 	unsigned secsize, maxsecsize;
304 
305 #ifdef DEBUG
306 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
307 		printf("%s: ccdinit\n", cs->sc_xname);
308 #endif
309 
310 	/* Allocate space for the component info. */
311 	cs->sc_cinfo = kmem_alloc(cs->sc_nccdisks * sizeof(*cs->sc_cinfo),
312 	    KM_SLEEP);
313 	tmppath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
314 
315 	cs->sc_size = 0;
316 
317 	/*
318 	 * Verify that each component piece exists and record
319 	 * relevant information about it.
320 	 */
321 	maxsecsize = 0;
322 	minsize = 0;
323 	for (ix = 0, path_alloced = 0; ix < cs->sc_nccdisks; ix++) {
324 		ci = &cs->sc_cinfo[ix];
325 		ci->ci_vp = vpp[ix];
326 
327 		/*
328 		 * Copy in the pathname of the component.
329 		 */
330 		memset(tmppath, 0, MAXPATHLEN);	/* sanity */
331 		error = copyinstr(cpaths[ix], tmppath,
332 		    MAXPATHLEN, &ci->ci_pathlen);
333 		if (ci->ci_pathlen == 0)
334 			error = EINVAL;
335 		if (error) {
336 #ifdef DEBUG
337 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
338 				printf("%s: can't copy path, error = %d\n",
339 				    cs->sc_xname, error);
340 #endif
341 			goto out;
342 		}
343 		ci->ci_path = kmem_alloc(ci->ci_pathlen, KM_SLEEP);
344 		memcpy(ci->ci_path, tmppath, ci->ci_pathlen);
345 		path_alloced++;
346 
347 		/*
348 		 * XXX: Cache the component's dev_t.
349 		 */
350 		ci->ci_dev = vpp[ix]->v_rdev;
351 
352 		/*
353 		 * Get partition information for the component.
354 		 */
355 		error = getdisksize(vpp[ix], &psize, &secsize);
356 		if (error) {
357 #ifdef DEBUG
358 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
359 				 printf("%s: %s: disksize failed, error = %d\n",
360 				     cs->sc_xname, ci->ci_path, error);
361 #endif
362 			goto out;
363 		}
364 
365 		/*
366 		 * Calculate the size, truncating to an interleave
367 		 * boundary if necessary.
368 		 */
369 		maxsecsize = secsize > maxsecsize ? secsize : maxsecsize;
370 		if (cs->sc_ileave > 1)
371 			psize -= psize % cs->sc_ileave;
372 
373 		if (psize == 0) {
374 #ifdef DEBUG
375 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
376 				printf("%s: %s: size == 0\n",
377 				    cs->sc_xname, ci->ci_path);
378 #endif
379 			error = ENODEV;
380 			goto out;
381 		}
382 
383 		if (minsize == 0 || psize < minsize)
384 			minsize = psize;
385 		ci->ci_size = psize;
386 		cs->sc_size += psize;
387 	}
388 
389 	/*
390 	 * Don't allow the interleave to be smaller than
391 	 * the biggest component sector.
392 	 */
393 	if ((cs->sc_ileave > 0) &&
394 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
395 #ifdef DEBUG
396 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
397 			printf("%s: interleave must be at least %d\n",
398 			    cs->sc_xname, (maxsecsize / DEV_BSIZE));
399 #endif
400 		error = EINVAL;
401 		goto out;
402 	}
403 
404 	/*
405 	 * If uniform interleave is desired set all sizes to that of
406 	 * the smallest component.
407 	 */
408 	if (cs->sc_flags & CCDF_UNIFORM) {
409 		for (ci = cs->sc_cinfo;
410 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
411 			ci->ci_size = minsize;
412 
413 		cs->sc_size = cs->sc_nccdisks * minsize;
414 	}
415 
416 	/*
417 	 * Construct the interleave table.
418 	 */
419 	ccdinterleave(cs);
420 
421 	/*
422 	 * Create pseudo-geometry based on 1MB cylinders.  It's
423 	 * pretty close.
424 	 */
425 	ccg->ccg_secsize = DEV_BSIZE;
426 	ccg->ccg_ntracks = 1;
427 	ccg->ccg_nsectors = 1024 * (1024 / ccg->ccg_secsize);
428 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
429 
430 	/*
431 	 * Create thread to handle deferred I/O.
432 	 */
433 	cs->sc_zap = false;
434 	error = kthread_create(PRI_BIO, KTHREAD_MPSAFE, NULL, ccdthread,
435 	    cs, &cs->sc_thread, "%s", cs->sc_xname);
436 	if (error) {
437 		printf("ccdinit: can't create thread: %d\n", error);
438 		goto out;
439 	}
440 
441 	/*
442 	 * Only now that everything is set up can we enable the device.
443 	 */
444 	mutex_enter(cs->sc_iolock);
445 	cs->sc_flags |= CCDF_INITED;
446 	mutex_exit(cs->sc_iolock);
447 	kmem_free(tmppath, MAXPATHLEN);
448 	return (0);
449 
450  out:
451 	for (ix = 0; ix < path_alloced; ix++) {
452 		kmem_free(cs->sc_cinfo[ix].ci_path,
453 		    cs->sc_cinfo[ix].ci_pathlen);
454 	}
455 	kmem_free(cs->sc_cinfo, cs->sc_nccdisks * sizeof(struct ccdcinfo));
456 	kmem_free(tmppath, MAXPATHLEN);
457 	return (error);
458 }
459 
460 static void
461 ccdinterleave(struct ccd_softc *cs)
462 {
463 	struct ccdcinfo *ci, *smallci;
464 	struct ccdiinfo *ii;
465 	daddr_t bn, lbn;
466 	int ix;
467 	u_long size;
468 
469 #ifdef DEBUG
470 	if (ccddebug & CCDB_INIT)
471 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
472 #endif
473 	/*
474 	 * Allocate an interleave table.
475 	 * Chances are this is too big, but we don't care.
476 	 */
477 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
478 	cs->sc_itable = kmem_zalloc(size, KM_SLEEP);
479 
480 	/*
481 	 * Trivial case: no interleave (actually interleave of disk size).
482 	 * Each table entry represents a single component in its entirety.
483 	 */
484 	if (cs->sc_ileave == 0) {
485 		bn = 0;
486 		ii = cs->sc_itable;
487 
488 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
489 			/* Allocate space for ii_index. */
490 			ii->ii_indexsz = sizeof(int);
491 			ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP);
492 			ii->ii_ndisk = 1;
493 			ii->ii_startblk = bn;
494 			ii->ii_startoff = 0;
495 			ii->ii_index[0] = ix;
496 			bn += cs->sc_cinfo[ix].ci_size;
497 			ii++;
498 		}
499 		ii->ii_ndisk = 0;
500 #ifdef DEBUG
501 		if (ccddebug & CCDB_INIT)
502 			printiinfo(cs->sc_itable);
503 #endif
504 		return;
505 	}
506 
507 	/*
508 	 * The following isn't fast or pretty; it doesn't have to be.
509 	 */
510 	size = 0;
511 	bn = lbn = 0;
512 	for (ii = cs->sc_itable; ; ii++) {
513 		/* Allocate space for ii_index. */
514 		ii->ii_indexsz = sizeof(int) * cs->sc_nccdisks;
515 		ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP);
516 
517 		/*
518 		 * Locate the smallest of the remaining components
519 		 */
520 		smallci = NULL;
521 		for (ci = cs->sc_cinfo;
522 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
523 			if (ci->ci_size > size &&
524 			    (smallci == NULL ||
525 			     ci->ci_size < smallci->ci_size))
526 				smallci = ci;
527 
528 		/*
529 		 * Nobody left, all done
530 		 */
531 		if (smallci == NULL) {
532 			ii->ii_ndisk = 0;
533 			break;
534 		}
535 
536 		/*
537 		 * Record starting logical block and component offset
538 		 */
539 		ii->ii_startblk = bn / cs->sc_ileave;
540 		ii->ii_startoff = lbn;
541 
542 		/*
543 		 * Determine how many disks take part in this interleave
544 		 * and record their indices.
545 		 */
546 		ix = 0;
547 		for (ci = cs->sc_cinfo;
548 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
549 			if (ci->ci_size >= smallci->ci_size)
550 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
551 		ii->ii_ndisk = ix;
552 		bn += ix * (smallci->ci_size - size);
553 		lbn = smallci->ci_size / cs->sc_ileave;
554 		size = smallci->ci_size;
555 	}
556 #ifdef DEBUG
557 	if (ccddebug & CCDB_INIT)
558 		printiinfo(cs->sc_itable);
559 #endif
560 }
561 
562 /* ARGSUSED */
563 static int
564 ccdopen(dev_t dev, int flags, int fmt, struct lwp *l)
565 {
566 	int unit = ccdunit(dev);
567 	struct ccd_softc *cs;
568 	struct disklabel *lp;
569 	int error = 0, part, pmask;
570 
571 #ifdef DEBUG
572 	if (ccddebug & CCDB_FOLLOW)
573 		printf("ccdopen(0x%"PRIx64", 0x%x)\n", dev, flags);
574 #endif
575 	if ((cs = ccdget(unit)) == NULL)
576 		return ENXIO;
577 
578 	mutex_enter(&cs->sc_dvlock);
579 
580 	lp = cs->sc_dkdev.dk_label;
581 
582 	part = DISKPART(dev);
583 	pmask = (1 << part);
584 
585 	/*
586 	 * If we're initialized, check to see if there are any other
587 	 * open partitions.  If not, then it's safe to update
588 	 * the in-core disklabel.  Only read the disklabel if it is
589 	 * not already valid.
590 	 */
591 	if ((cs->sc_flags & (CCDF_INITED|CCDF_VLABEL)) == CCDF_INITED &&
592 	    cs->sc_dkdev.dk_openmask == 0)
593 		ccdgetdisklabel(dev);
594 
595 	/* Check that the partition exists. */
596 	if (part != RAW_PART) {
597 		if (((cs->sc_flags & CCDF_INITED) == 0) ||
598 		    ((part >= lp->d_npartitions) ||
599 		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
600 			error = ENXIO;
601 			goto done;
602 		}
603 	}
604 
605 	/* Prevent our unit from being unconfigured while open. */
606 	switch (fmt) {
607 	case S_IFCHR:
608 		cs->sc_dkdev.dk_copenmask |= pmask;
609 		break;
610 
611 	case S_IFBLK:
612 		cs->sc_dkdev.dk_bopenmask |= pmask;
613 		break;
614 	}
615 	cs->sc_dkdev.dk_openmask =
616 	    cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask;
617 
618  done:
619 	mutex_exit(&cs->sc_dvlock);
620 	return (error);
621 }
622 
623 /* ARGSUSED */
624 static int
625 ccdclose(dev_t dev, int flags, int fmt, struct lwp *l)
626 {
627 	int unit = ccdunit(dev);
628 	struct ccd_softc *cs;
629 	int part;
630 
631 #ifdef DEBUG
632 	if (ccddebug & CCDB_FOLLOW)
633 		printf("ccdclose(0x%"PRIx64", 0x%x)\n", dev, flags);
634 #endif
635 
636 	if ((cs = ccdget(unit)) == NULL)
637 		return ENXIO;
638 
639 	mutex_enter(&cs->sc_dvlock);
640 
641 	part = DISKPART(dev);
642 
643 	/* ...that much closer to allowing unconfiguration... */
644 	switch (fmt) {
645 	case S_IFCHR:
646 		cs->sc_dkdev.dk_copenmask &= ~(1 << part);
647 		break;
648 
649 	case S_IFBLK:
650 		cs->sc_dkdev.dk_bopenmask &= ~(1 << part);
651 		break;
652 	}
653 	cs->sc_dkdev.dk_openmask =
654 	    cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask;
655 
656 	if (cs->sc_dkdev.dk_openmask == 0) {
657 		if ((cs->sc_flags & CCDF_KLABEL) == 0)
658 			cs->sc_flags &= ~CCDF_VLABEL;
659 	}
660 
661 	mutex_exit(&cs->sc_dvlock);
662 	return (0);
663 }
664 
665 static bool
666 ccdbackoff(struct ccd_softc *cs)
667 {
668 
669 	/* XXX Arbitrary, should be a uvm call. */
670 	return uvmexp.free < (uvmexp.freemin >> 1) &&
671 	    disk_isbusy(&cs->sc_dkdev);
672 }
673 
674 static void
675 ccdthread(void *cookie)
676 {
677 	struct ccd_softc *cs;
678 
679 	cs = cookie;
680 
681 #ifdef DEBUG
682  	if (ccddebug & CCDB_FOLLOW)
683  		printf("ccdthread: hello\n");
684 #endif
685 
686 	mutex_enter(cs->sc_iolock);
687 	while (__predict_true(!cs->sc_zap)) {
688 		if (bufq_peek(cs->sc_bufq) == NULL) {
689 			/* Nothing to do. */
690 			cv_wait(&cs->sc_push, cs->sc_iolock);
691 			continue;
692 		}
693 		if (ccdbackoff(cs)) {
694 			/* Wait for memory to become available. */
695 			(void)cv_timedwait(&cs->sc_push, cs->sc_iolock, 1);
696 			continue;
697 		}
698 #ifdef DEBUG
699  		if (ccddebug & CCDB_FOLLOW)
700  			printf("ccdthread: dispatching I/O\n");
701 #endif
702 		ccdstart(cs);
703 		mutex_enter(cs->sc_iolock);
704 	}
705 	cs->sc_thread = NULL;
706 	mutex_exit(cs->sc_iolock);
707 #ifdef DEBUG
708  	if (ccddebug & CCDB_FOLLOW)
709  		printf("ccdthread: goodbye\n");
710 #endif
711 	kthread_exit(0);
712 }
713 
714 static void
715 ccdstrategy(struct buf *bp)
716 {
717 	int unit = ccdunit(bp->b_dev);
718 	struct ccd_softc *cs;
719 	if ((cs = ccdget(unit)) == NULL)
720 		return;
721 
722 	/* Must be open or reading label. */
723 	KASSERT(cs->sc_dkdev.dk_openmask != 0 ||
724 	    (cs->sc_flags & CCDF_RLABEL) != 0);
725 
726 	mutex_enter(cs->sc_iolock);
727 	/* Synchronize with device init/uninit. */
728 	if (__predict_false((cs->sc_flags & CCDF_INITED) == 0)) {
729 		mutex_exit(cs->sc_iolock);
730 #ifdef DEBUG
731  		if (ccddebug & CCDB_FOLLOW)
732  			printf("ccdstrategy: unit %d: not inited\n", unit);
733 #endif
734  		bp->b_error = ENXIO;
735  		bp->b_resid = bp->b_bcount;
736  		biodone(bp);
737 		return;
738 	}
739 
740 	/* Defer to thread if system is low on memory. */
741 	bufq_put(cs->sc_bufq, bp);
742 	if (__predict_false(ccdbackoff(cs))) {
743 		mutex_exit(cs->sc_iolock);
744 #ifdef DEBUG
745  		if (ccddebug & CCDB_FOLLOW)
746  			printf("ccdstrategy: holding off on I/O\n");
747 #endif
748 		return;
749 	}
750 	ccdstart(cs);
751 }
752 
753 static void
754 ccdstart(struct ccd_softc *cs)
755 {
756 	daddr_t blkno;
757 	int wlabel;
758 	struct disklabel *lp;
759 	long bcount, rcount;
760 	struct ccdbuf *cbp;
761 	char *addr;
762 	daddr_t bn;
763 	vnode_t *vp;
764 	buf_t *bp;
765 
766 	KASSERT(mutex_owned(cs->sc_iolock));
767 
768 	disk_busy(&cs->sc_dkdev);
769 	bp = bufq_get(cs->sc_bufq);
770 	KASSERT(bp != NULL);
771 
772 #ifdef DEBUG
773 	if (ccddebug & CCDB_FOLLOW)
774 		printf("ccdstart(%s, %p)\n", cs->sc_xname, bp);
775 #endif
776 
777 	/* If it's a nil transfer, wake up the top half now. */
778 	if (bp->b_bcount == 0)
779 		goto done;
780 
781 	lp = cs->sc_dkdev.dk_label;
782 
783 	/*
784 	 * Do bounds checking and adjust transfer.  If there's an
785 	 * error, the bounds check will flag that for us.  Convert
786 	 * the partition relative block number to an absolute.
787 	 */
788 	blkno = bp->b_blkno;
789 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
790 	if (DISKPART(bp->b_dev) != RAW_PART) {
791 		if (bounds_check_with_label(&cs->sc_dkdev, bp, wlabel) <= 0)
792 			goto done;
793 		blkno += lp->d_partitions[DISKPART(bp->b_dev)].p_offset;
794 	}
795 	mutex_exit(cs->sc_iolock);
796 	bp->b_rawblkno = blkno;
797 
798 	/* Allocate the component buffers and start I/O! */
799 	bp->b_resid = bp->b_bcount;
800 	bn = bp->b_rawblkno;
801 	addr = bp->b_data;
802 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
803 		cbp = ccdbuffer(cs, bp, bn, addr, bcount);
804 		rcount = cbp->cb_buf.b_bcount;
805 		bn += btodb(rcount);
806 		addr += rcount;
807 		vp = cbp->cb_buf.b_vp;
808 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
809 			mutex_enter(vp->v_interlock);
810 			vp->v_numoutput++;
811 			mutex_exit(vp->v_interlock);
812 		}
813 		(void)VOP_STRATEGY(vp, &cbp->cb_buf);
814 	}
815 	return;
816 
817  done:
818 	disk_unbusy(&cs->sc_dkdev, 0, 0);
819 	cv_broadcast(&cs->sc_stop);
820 	cv_broadcast(&cs->sc_push);
821 	mutex_exit(cs->sc_iolock);
822 	bp->b_resid = bp->b_bcount;
823 	biodone(bp);
824 }
825 
826 /*
827  * Build a component buffer header.
828  */
829 static struct ccdbuf *
830 ccdbuffer(struct ccd_softc *cs, struct buf *bp, daddr_t bn, void *addr,
831     long bcount)
832 {
833 	struct ccdcinfo *ci;
834 	struct ccdbuf *cbp;
835 	daddr_t cbn, cboff;
836 	u_int64_t cbc;
837 	int ccdisk;
838 
839 #ifdef DEBUG
840 	if (ccddebug & CCDB_IO)
841 		printf("ccdbuffer(%p, %p, %" PRId64 ", %p, %ld)\n",
842 		       cs, bp, bn, addr, bcount);
843 #endif
844 	/*
845 	 * Determine which component bn falls in.
846 	 */
847 	cbn = bn;
848 	cboff = 0;
849 
850 	/*
851 	 * Serially concatenated
852 	 */
853 	if (cs->sc_ileave == 0) {
854 		daddr_t sblk;
855 
856 		sblk = 0;
857 		for (ccdisk = 0, ci = &cs->sc_cinfo[ccdisk];
858 		    cbn >= sblk + ci->ci_size;
859 		    ccdisk++, ci = &cs->sc_cinfo[ccdisk])
860 			sblk += ci->ci_size;
861 		cbn -= sblk;
862 	}
863 	/*
864 	 * Interleaved
865 	 */
866 	else {
867 		struct ccdiinfo *ii;
868 		int off;
869 
870 		cboff = cbn % cs->sc_ileave;
871 		cbn /= cs->sc_ileave;
872 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++)
873 			if (ii->ii_startblk > cbn)
874 				break;
875 		ii--;
876 		off = cbn - ii->ii_startblk;
877 		if (ii->ii_ndisk == 1) {
878 			ccdisk = ii->ii_index[0];
879 			cbn = ii->ii_startoff + off;
880 		} else {
881 			ccdisk = ii->ii_index[off % ii->ii_ndisk];
882 			cbn = ii->ii_startoff + off / ii->ii_ndisk;
883 		}
884 		cbn *= cs->sc_ileave;
885 		ci = &cs->sc_cinfo[ccdisk];
886 	}
887 
888 	/*
889 	 * Fill in the component buf structure.
890 	 */
891 	cbp = CCD_GETBUF();
892 	KASSERT(cbp != NULL);
893 	buf_init(&cbp->cb_buf);
894 	cbp->cb_buf.b_flags = bp->b_flags;
895 	cbp->cb_buf.b_oflags = bp->b_oflags;
896 	cbp->cb_buf.b_cflags = bp->b_cflags;
897 	cbp->cb_buf.b_iodone = ccdiodone;
898 	cbp->cb_buf.b_proc = bp->b_proc;
899 	cbp->cb_buf.b_dev = ci->ci_dev;
900 	cbp->cb_buf.b_blkno = cbn + cboff;
901 	cbp->cb_buf.b_data = addr;
902 	cbp->cb_buf.b_vp = ci->ci_vp;
903 	cbp->cb_buf.b_objlock = ci->ci_vp->v_interlock;
904 	if (cs->sc_ileave == 0)
905 		cbc = dbtob((u_int64_t)(ci->ci_size - cbn));
906 	else
907 		cbc = dbtob((u_int64_t)(cs->sc_ileave - cboff));
908 	cbp->cb_buf.b_bcount = cbc < bcount ? cbc : bcount;
909 
910 	/*
911 	 * context for ccdiodone
912 	 */
913 	cbp->cb_obp = bp;
914 	cbp->cb_sc = cs;
915 	cbp->cb_comp = ccdisk;
916 
917 	BIO_COPYPRIO(&cbp->cb_buf, bp);
918 
919 #ifdef DEBUG
920 	if (ccddebug & CCDB_IO)
921 		printf(" dev 0x%"PRIx64"(u%lu): cbp %p bn %" PRId64 " addr %p"
922 		       " bcnt %d\n",
923 		    ci->ci_dev, (unsigned long) (ci-cs->sc_cinfo), cbp,
924 		    cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
925 		    cbp->cb_buf.b_bcount);
926 #endif
927 
928 	return (cbp);
929 }
930 
931 /*
932  * Called at interrupt time.
933  * Mark the component as done and if all components are done,
934  * take a ccd interrupt.
935  */
936 static void
937 ccdiodone(struct buf *vbp)
938 {
939 	struct ccdbuf *cbp = (struct ccdbuf *) vbp;
940 	struct buf *bp = cbp->cb_obp;
941 	struct ccd_softc *cs = cbp->cb_sc;
942 	int count;
943 
944 #ifdef DEBUG
945 	if (ccddebug & CCDB_FOLLOW)
946 		printf("ccdiodone(%p)\n", cbp);
947 	if (ccddebug & CCDB_IO) {
948 		printf("ccdiodone: bp %p bcount %d resid %d\n",
949 		       bp, bp->b_bcount, bp->b_resid);
950 		printf(" dev 0x%"PRIx64"(u%d), cbp %p bn %" PRId64 " addr %p"
951 		       " bcnt %d\n",
952 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
953 		       cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
954 		       cbp->cb_buf.b_bcount);
955 	}
956 #endif
957 
958 	if (cbp->cb_buf.b_error != 0) {
959 		bp->b_error = cbp->cb_buf.b_error;
960 		printf("%s: error %d on component %d\n",
961 		       cs->sc_xname, bp->b_error, cbp->cb_comp);
962 	}
963 	count = cbp->cb_buf.b_bcount;
964 	buf_destroy(&cbp->cb_buf);
965 	CCD_PUTBUF(cbp);
966 
967 	/*
968 	 * If all done, "interrupt".
969 	 */
970 	mutex_enter(cs->sc_iolock);
971 	bp->b_resid -= count;
972 	if (bp->b_resid < 0)
973 		panic("ccdiodone: count");
974 	if (bp->b_resid == 0) {
975 		/*
976 		 * Request is done for better or worse, wakeup the top half.
977 		 */
978 		if (bp->b_error != 0)
979 			bp->b_resid = bp->b_bcount;
980 		disk_unbusy(&cs->sc_dkdev, (bp->b_bcount - bp->b_resid),
981 		    (bp->b_flags & B_READ));
982 		if (!disk_isbusy(&cs->sc_dkdev)) {
983 			if (bufq_peek(cs->sc_bufq) != NULL) {
984 				cv_broadcast(&cs->sc_push);
985 			}
986 			cv_broadcast(&cs->sc_stop);
987 		}
988 		mutex_exit(cs->sc_iolock);
989 		biodone(bp);
990 	} else
991 		mutex_exit(cs->sc_iolock);
992 }
993 
994 /* ARGSUSED */
995 static int
996 ccdread(dev_t dev, struct uio *uio, int flags)
997 {
998 	int unit = ccdunit(dev);
999 	struct ccd_softc *cs;
1000 
1001 #ifdef DEBUG
1002 	if (ccddebug & CCDB_FOLLOW)
1003 		printf("ccdread(0x%"PRIx64", %p)\n", dev, uio);
1004 #endif
1005 	if ((cs = ccdget(unit)) == NULL)
1006 		return 0;
1007 
1008 	/* Unlocked advisory check, ccdstrategy check is synchronous. */
1009 	if ((cs->sc_flags & CCDF_INITED) == 0)
1010 		return (ENXIO);
1011 
1012 	return (physio(ccdstrategy, NULL, dev, B_READ, minphys, uio));
1013 }
1014 
1015 /* ARGSUSED */
1016 static int
1017 ccdwrite(dev_t dev, struct uio *uio, int flags)
1018 {
1019 	int unit = ccdunit(dev);
1020 	struct ccd_softc *cs;
1021 
1022 #ifdef DEBUG
1023 	if (ccddebug & CCDB_FOLLOW)
1024 		printf("ccdwrite(0x%"PRIx64", %p)\n", dev, uio);
1025 #endif
1026 	if ((cs = ccdget(unit)) == NULL)
1027 		return ENOENT;
1028 
1029 	/* Unlocked advisory check, ccdstrategy check is synchronous. */
1030 	if ((cs->sc_flags & CCDF_INITED) == 0)
1031 		return (ENXIO);
1032 
1033 	return (physio(ccdstrategy, NULL, dev, B_WRITE, minphys, uio));
1034 }
1035 
1036 static int
1037 ccdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1038 {
1039 	int unit = ccdunit(dev);
1040 	int i, j, lookedup = 0, error = 0;
1041 	int part, pmask;
1042 	struct ccd_softc *cs;
1043 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1044 	kauth_cred_t uc;
1045 	char **cpp;
1046 	struct pathbuf *pb;
1047 	struct vnode **vpp;
1048 #ifdef __HAVE_OLD_DISKLABEL
1049 	struct disklabel newlabel;
1050 #endif
1051 
1052 	if ((cs = ccdget(unit)) == NULL)
1053 		return ENOENT;
1054 	uc = kauth_cred_get();
1055 
1056 	/* Must be open for writes for these commands... */
1057 	switch (cmd) {
1058 	case CCDIOCSET:
1059 	case CCDIOCCLR:
1060 	case DIOCSDINFO:
1061 	case DIOCWDINFO:
1062 #ifdef __HAVE_OLD_DISKLABEL
1063 	case ODIOCSDINFO:
1064 	case ODIOCWDINFO:
1065 #endif
1066 	case DIOCKLABEL:
1067 	case DIOCWLABEL:
1068 		if ((flag & FWRITE) == 0)
1069 			return (EBADF);
1070 	}
1071 
1072 	mutex_enter(&cs->sc_dvlock);
1073 
1074 	/* Must be initialized for these... */
1075 	switch (cmd) {
1076 	case CCDIOCCLR:
1077 	case DIOCGDINFO:
1078 	case DIOCCACHESYNC:
1079 	case DIOCSDINFO:
1080 	case DIOCWDINFO:
1081 	case DIOCGPART:
1082 	case DIOCWLABEL:
1083 	case DIOCKLABEL:
1084 	case DIOCGDEFLABEL:
1085 #ifdef __HAVE_OLD_DISKLABEL
1086 	case ODIOCGDINFO:
1087 	case ODIOCSDINFO:
1088 	case ODIOCWDINFO:
1089 	case ODIOCGDEFLABEL:
1090 #endif
1091 		if ((cs->sc_flags & CCDF_INITED) == 0) {
1092 			error = ENXIO;
1093 			goto out;
1094 		}
1095 	}
1096 
1097 	switch (cmd) {
1098 	case CCDIOCSET:
1099 		if (cs->sc_flags & CCDF_INITED) {
1100 			error = EBUSY;
1101 			goto out;
1102 		}
1103 
1104 		/* Validate the flags. */
1105 		if ((ccio->ccio_flags & CCDF_USERMASK) != ccio->ccio_flags) {
1106 			error = EINVAL;
1107 			goto out;
1108 		}
1109 
1110 		if (ccio->ccio_ndisks > CCD_MAXNDISKS ||
1111 		    ccio->ccio_ndisks == 0) {
1112 			error = EINVAL;
1113 			goto out;
1114 		}
1115 
1116 		/* Fill in some important bits. */
1117 		cs->sc_ileave = ccio->ccio_ileave;
1118 		cs->sc_nccdisks = ccio->ccio_ndisks;
1119 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1120 
1121 		/*
1122 		 * Allocate space for and copy in the array of
1123 		 * componet pathnames and device numbers.
1124 		 */
1125 		cpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*cpp), KM_SLEEP);
1126 		vpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*vpp), KM_SLEEP);
1127 		error = copyin(ccio->ccio_disks, cpp,
1128 		    ccio->ccio_ndisks * sizeof(*cpp));
1129 		if (error) {
1130 			kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1131 			kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1132 			goto out;
1133 		}
1134 
1135 #ifdef DEBUG
1136 		if (ccddebug & CCDB_INIT)
1137 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1138 				printf("ccdioctl: component %d: %p\n",
1139 				    i, cpp[i]);
1140 #endif
1141 
1142 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1143 #ifdef DEBUG
1144 			if (ccddebug & CCDB_INIT)
1145 				printf("ccdioctl: lookedup = %d\n", lookedup);
1146 #endif
1147 			error = pathbuf_copyin(cpp[i], &pb);
1148 			if (error == 0) {
1149 				error = dk_lookup(pb, l, &vpp[i]);
1150 			}
1151 			pathbuf_destroy(pb);
1152 			if (error != 0) {
1153 				for (j = 0; j < lookedup; ++j)
1154 					(void)vn_close(vpp[j], FREAD|FWRITE,
1155 					    uc);
1156 				kmem_free(vpp, ccio->ccio_ndisks *
1157 				    sizeof(*vpp));
1158 				kmem_free(cpp, ccio->ccio_ndisks *
1159 				    sizeof(*cpp));
1160 				goto out;
1161 			}
1162 			++lookedup;
1163 		}
1164 
1165 		/* Attach the disk. */
1166 		disk_attach(&cs->sc_dkdev);
1167 		bufq_alloc(&cs->sc_bufq, "fcfs", 0);
1168 
1169 		/*
1170 		 * Initialize the ccd.  Fills in the softc for us.
1171 		 */
1172 		if ((error = ccdinit(cs, cpp, vpp, l)) != 0) {
1173 			for (j = 0; j < lookedup; ++j)
1174 				(void)vn_close(vpp[j], FREAD|FWRITE,
1175 				    uc);
1176 			kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1177 			kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1178 			disk_detach(&cs->sc_dkdev);
1179 			bufq_free(cs->sc_bufq);
1180 			goto out;
1181 		}
1182 
1183 		/* We can free the temporary variables now. */
1184 		kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1185 		kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1186 
1187 		/*
1188 		 * The ccd has been successfully initialized, so
1189 		 * we can place it into the array.  Don't try to
1190 		 * read the disklabel until the disk has been attached,
1191 		 * because space for the disklabel is allocated
1192 		 * in disk_attach();
1193 		 */
1194 		ccio->ccio_unit = unit;
1195 		ccio->ccio_size = cs->sc_size;
1196 
1197 		/* Try and read the disklabel. */
1198 		ccdgetdisklabel(dev);
1199 		break;
1200 
1201 	case CCDIOCCLR:
1202 		/*
1203 		 * Don't unconfigure if any other partitions are open
1204 		 * or if both the character and block flavors of this
1205 		 * partition are open.
1206 		 */
1207 		part = DISKPART(dev);
1208 		pmask = (1 << part);
1209 		if ((cs->sc_dkdev.dk_openmask & ~pmask) ||
1210 		    ((cs->sc_dkdev.dk_bopenmask & pmask) &&
1211 		    (cs->sc_dkdev.dk_copenmask & pmask))) {
1212 			error = EBUSY;
1213 			goto out;
1214 		}
1215 
1216 		/* Stop new I/O, wait for in-flight I/O to complete. */
1217 		mutex_enter(cs->sc_iolock);
1218 		cs->sc_flags &= ~(CCDF_INITED|CCDF_VLABEL);
1219 		cs->sc_zap = true;
1220 		while (disk_isbusy(&cs->sc_dkdev) ||
1221 		    bufq_peek(cs->sc_bufq) != NULL ||
1222 		    cs->sc_thread != NULL) {
1223 			cv_broadcast(&cs->sc_push);
1224 			(void)cv_timedwait(&cs->sc_stop, cs->sc_iolock, hz);
1225 		}
1226 		mutex_exit(cs->sc_iolock);
1227 
1228 		/*
1229 		 * Free ccd_softc information and clear entry.
1230 		 */
1231 
1232 		/* Close the components and free their pathnames. */
1233 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1234 			/*
1235 			 * XXX: this close could potentially fail and
1236 			 * cause Bad Things.  Maybe we need to force
1237 			 * the close to happen?
1238 			 */
1239 #ifdef DEBUG
1240 			if (ccddebug & CCDB_VNODE)
1241 				vprint("CCDIOCCLR: vnode info",
1242 				    cs->sc_cinfo[i].ci_vp);
1243 #endif
1244 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1245 			    uc);
1246 			kmem_free(cs->sc_cinfo[i].ci_path,
1247 			    cs->sc_cinfo[i].ci_pathlen);
1248 		}
1249 
1250 		/* Free interleave index. */
1251 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) {
1252 			kmem_free(cs->sc_itable[i].ii_index,
1253 			    cs->sc_itable[i].ii_indexsz);
1254 		}
1255 
1256 		/* Free component info and interleave table. */
1257 		kmem_free(cs->sc_cinfo, cs->sc_nccdisks *
1258 		    sizeof(struct ccdcinfo));
1259 		kmem_free(cs->sc_itable, (cs->sc_nccdisks + 1) *
1260 		    sizeof(struct ccdiinfo));
1261 
1262 		/* Detatch the disk. */
1263 		disk_detach(&cs->sc_dkdev);
1264 		bufq_free(cs->sc_bufq);
1265 		ccdput(cs);
1266 		/* Don't break, otherwise cs is read again. */
1267 		return 0;
1268 
1269 	case DIOCGDINFO:
1270 		*(struct disklabel *)data = *(cs->sc_dkdev.dk_label);
1271 		break;
1272 
1273 #ifdef __HAVE_OLD_DISKLABEL
1274 	case ODIOCGDINFO:
1275 		newlabel = *(cs->sc_dkdev.dk_label);
1276 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1277 			return ENOTTY;
1278 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1279 		break;
1280 #endif
1281 
1282 	case DIOCGPART:
1283 		((struct partinfo *)data)->disklab = cs->sc_dkdev.dk_label;
1284 		((struct partinfo *)data)->part =
1285 		    &cs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1286 		break;
1287 
1288 	case DIOCCACHESYNC:
1289 		/*
1290 		 * XXX Do we really need to care about having a writable
1291 		 * file descriptor here?
1292 		 */
1293 		if ((flag & FWRITE) == 0)
1294 			return (EBADF);
1295 
1296 		/*
1297 		 * We pass this call down to all components and report
1298 		 * the first error we encounter.
1299 		 */
1300 		for (error = 0, i = 0; i < cs->sc_nccdisks; i++) {
1301 			j = VOP_IOCTL(cs->sc_cinfo[i].ci_vp, cmd, data,
1302 				      flag, uc);
1303 			if (j != 0 && error == 0)
1304 				error = j;
1305 		}
1306 		break;
1307 
1308 	case DIOCWDINFO:
1309 	case DIOCSDINFO:
1310 #ifdef __HAVE_OLD_DISKLABEL
1311 	case ODIOCWDINFO:
1312 	case ODIOCSDINFO:
1313 #endif
1314 	{
1315 		struct disklabel *lp;
1316 #ifdef __HAVE_OLD_DISKLABEL
1317 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1318 			memset(&newlabel, 0, sizeof newlabel);
1319 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
1320 			lp = &newlabel;
1321 		} else
1322 #endif
1323 		lp = (struct disklabel *)data;
1324 
1325 		cs->sc_flags |= CCDF_LABELLING;
1326 
1327 		error = setdisklabel(cs->sc_dkdev.dk_label,
1328 		    lp, 0, cs->sc_dkdev.dk_cpulabel);
1329 		if (error == 0) {
1330 			if (cmd == DIOCWDINFO
1331 #ifdef __HAVE_OLD_DISKLABEL
1332 			    || cmd == ODIOCWDINFO
1333 #endif
1334 			   )
1335 				error = writedisklabel(CCDLABELDEV(dev),
1336 				    ccdstrategy, cs->sc_dkdev.dk_label,
1337 				    cs->sc_dkdev.dk_cpulabel);
1338 		}
1339 
1340 		cs->sc_flags &= ~CCDF_LABELLING;
1341 		break;
1342 	}
1343 
1344 	case DIOCKLABEL:
1345 		if (*(int *)data != 0)
1346 			cs->sc_flags |= CCDF_KLABEL;
1347 		else
1348 			cs->sc_flags &= ~CCDF_KLABEL;
1349 		break;
1350 
1351 	case DIOCWLABEL:
1352 		if (*(int *)data != 0)
1353 			cs->sc_flags |= CCDF_WLABEL;
1354 		else
1355 			cs->sc_flags &= ~CCDF_WLABEL;
1356 		break;
1357 
1358 	case DIOCGDEFLABEL:
1359 		ccdgetdefaultlabel(cs, (struct disklabel *)data);
1360 		break;
1361 
1362 #ifdef __HAVE_OLD_DISKLABEL
1363 	case ODIOCGDEFLABEL:
1364 		ccdgetdefaultlabel(cs, &newlabel);
1365 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1366 			return ENOTTY;
1367 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1368 		break;
1369 #endif
1370 
1371 	default:
1372 		error = ENOTTY;
1373 	}
1374 
1375  out:
1376 	mutex_exit(&cs->sc_dvlock);
1377 	return (error);
1378 }
1379 
1380 static int
1381 ccdsize(dev_t dev)
1382 {
1383 	struct ccd_softc *cs;
1384 	struct disklabel *lp;
1385 	int part, unit, omask, size;
1386 
1387 	unit = ccdunit(dev);
1388 	if ((cs = ccdget(unit)) == NULL)
1389 		return -1;
1390 
1391 	if ((cs->sc_flags & CCDF_INITED) == 0)
1392 		return (-1);
1393 
1394 	part = DISKPART(dev);
1395 	omask = cs->sc_dkdev.dk_openmask & (1 << part);
1396 	lp = cs->sc_dkdev.dk_label;
1397 
1398 	if (omask == 0 && ccdopen(dev, 0, S_IFBLK, curlwp))
1399 		return (-1);
1400 
1401 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
1402 		size = -1;
1403 	else
1404 		size = lp->d_partitions[part].p_size *
1405 		    (lp->d_secsize / DEV_BSIZE);
1406 
1407 	if (omask == 0 && ccdclose(dev, 0, S_IFBLK, curlwp))
1408 		return (-1);
1409 
1410 	return (size);
1411 }
1412 
1413 static void
1414 ccdgetdefaultlabel(struct ccd_softc *cs, struct disklabel *lp)
1415 {
1416 	struct ccdgeom *ccg = &cs->sc_geom;
1417 
1418 	memset(lp, 0, sizeof(*lp));
1419 
1420 	lp->d_secperunit = cs->sc_size;
1421 	lp->d_secsize = ccg->ccg_secsize;
1422 	lp->d_nsectors = ccg->ccg_nsectors;
1423 	lp->d_ntracks = ccg->ccg_ntracks;
1424 	lp->d_ncylinders = ccg->ccg_ncylinders;
1425 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1426 
1427 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1428 	lp->d_type = DTYPE_CCD;
1429 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1430 	lp->d_rpm = 3600;
1431 	lp->d_interleave = 1;
1432 	lp->d_flags = 0;
1433 
1434 	lp->d_partitions[RAW_PART].p_offset = 0;
1435 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1436 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1437 	lp->d_npartitions = RAW_PART + 1;
1438 
1439 	lp->d_magic = DISKMAGIC;
1440 	lp->d_magic2 = DISKMAGIC;
1441 	lp->d_checksum = dkcksum(cs->sc_dkdev.dk_label);
1442 }
1443 
1444 /*
1445  * Read the disklabel from the ccd.  If one is not present, fake one
1446  * up.
1447  */
1448 static void
1449 ccdgetdisklabel(dev_t dev)
1450 {
1451 	int unit = ccdunit(dev);
1452 	struct ccd_softc *cs;
1453 	const char *errstring;
1454 	struct disklabel *lp;
1455 	struct cpu_disklabel *clp;
1456 
1457 	if ((cs = ccdget(unit)) == NULL)
1458 		return;
1459 	lp = cs->sc_dkdev.dk_label;
1460 	clp = cs->sc_dkdev.dk_cpulabel;
1461 	KASSERT(mutex_owned(&cs->sc_dvlock));
1462 
1463 	memset(clp, 0, sizeof(*clp));
1464 
1465 	ccdgetdefaultlabel(cs, lp);
1466 
1467 	/*
1468 	 * Call the generic disklabel extraction routine.
1469 	 */
1470 	cs->sc_flags |= CCDF_RLABEL;
1471 	if ((cs->sc_flags & CCDF_NOLABEL) != 0)
1472 		errstring = "CCDF_NOLABEL set; ignoring on-disk label";
1473 	else
1474 		errstring = readdisklabel(CCDLABELDEV(dev), ccdstrategy,
1475 		    cs->sc_dkdev.dk_label, cs->sc_dkdev.dk_cpulabel);
1476 	if (errstring)
1477 		ccdmakedisklabel(cs);
1478 	else {
1479 		int i;
1480 		struct partition *pp;
1481 
1482 		/*
1483 		 * Sanity check whether the found disklabel is valid.
1484 		 *
1485 		 * This is necessary since total size of ccd may vary
1486 		 * when an interleave is changed even though exactly
1487 		 * same componets are used, and old disklabel may used
1488 		 * if that is found.
1489 		 */
1490 		if (lp->d_secperunit != cs->sc_size)
1491 			printf("WARNING: %s: "
1492 			    "total sector size in disklabel (%d) != "
1493 			    "the size of ccd (%lu)\n", cs->sc_xname,
1494 			    lp->d_secperunit, (u_long)cs->sc_size);
1495 		for (i = 0; i < lp->d_npartitions; i++) {
1496 			pp = &lp->d_partitions[i];
1497 			if (pp->p_offset + pp->p_size > cs->sc_size)
1498 				printf("WARNING: %s: end of partition `%c' "
1499 				    "exceeds the size of ccd (%lu)\n",
1500 				    cs->sc_xname, 'a' + i, (u_long)cs->sc_size);
1501 		}
1502 	}
1503 
1504 #ifdef DEBUG
1505 	/* It's actually extremely common to have unlabeled ccds. */
1506 	if (ccddebug & CCDB_LABEL)
1507 		if (errstring != NULL)
1508 			printf("%s: %s\n", cs->sc_xname, errstring);
1509 #endif
1510 
1511 	/* In-core label now valid. */
1512 	cs->sc_flags = (cs->sc_flags | CCDF_VLABEL) & ~CCDF_RLABEL;
1513 }
1514 
1515 /*
1516  * Take care of things one might want to take care of in the event
1517  * that a disklabel isn't present.
1518  */
1519 static void
1520 ccdmakedisklabel(struct ccd_softc *cs)
1521 {
1522 	struct disklabel *lp = cs->sc_dkdev.dk_label;
1523 
1524 	/*
1525 	 * For historical reasons, if there's no disklabel present
1526 	 * the raw partition must be marked FS_BSDFFS.
1527 	 */
1528 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1529 
1530 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1531 
1532 	lp->d_checksum = dkcksum(lp);
1533 }
1534 
1535 #ifdef DEBUG
1536 static void
1537 printiinfo(struct ccdiinfo *ii)
1538 {
1539 	int ix, i;
1540 
1541 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1542 		printf(" itab[%d]: #dk %d sblk %" PRId64 " soff %" PRId64,
1543 		    ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1544 		for (i = 0; i < ii->ii_ndisk; i++)
1545 			printf(" %d", ii->ii_index[i]);
1546 		printf("\n");
1547 	}
1548 }
1549 #endif
1550 
1551 MODULE(MODULE_CLASS_DRIVER, ccd, "dk_subr");
1552 
1553 static int
1554 ccd_modcmd(modcmd_t cmd, void *arg)
1555 {
1556 	int error = 0;
1557 #ifdef _MODULE
1558 	int bmajor = -1, cmajor = -1;
1559 #endif
1560 
1561 
1562 	switch (cmd) {
1563 	case MODULE_CMD_INIT:
1564 #ifdef _MODULE
1565 		ccdattach(4);
1566 
1567 		return devsw_attach("ccd", &ccd_bdevsw, &bmajor,
1568 		    &ccd_cdevsw, &cmajor);
1569 #endif
1570 		break;
1571 
1572 	case MODULE_CMD_FINI:
1573 #ifdef _MODULE
1574 		return devsw_detach(&ccd_bdevsw, &ccd_cdevsw);
1575 #endif
1576 		break;
1577 
1578 	case MODULE_CMD_STAT:
1579 		return ENOTTY;
1580 
1581 	default:
1582 		return ENOTTY;
1583 	}
1584 
1585 	return error;
1586 }
1587 
1588 static int
1589 ccd_units_sysctl(SYSCTLFN_ARGS)
1590 {
1591 	struct sysctlnode node;
1592 	struct ccd_softc *sc;
1593 	int error, i, nccd, *units;
1594 	size_t size;
1595 
1596 	nccd = 0;
1597 	mutex_enter(&ccd_lock);
1598 	LIST_FOREACH(sc, &ccds, sc_link)
1599 		nccd++;
1600 	mutex_exit(&ccd_lock);
1601 
1602 	if (nccd != 0) {
1603 		size = nccd * sizeof(*units);
1604 		units = kmem_zalloc(size, KM_SLEEP);
1605 		if (units == NULL)
1606 			return ENOMEM;
1607 
1608 		i = 0;
1609 		mutex_enter(&ccd_lock);
1610 		LIST_FOREACH(sc, &ccds, sc_link) {
1611 			if (i >= nccd)
1612 				break;
1613 			units[i] = sc->sc_unit;
1614 		}
1615 		mutex_exit(&ccd_lock);
1616 	} else {
1617 		units = NULL;
1618 		size = 0;
1619 	}
1620 
1621 	node = *rnode;
1622 	node.sysctl_data = units;
1623 	node.sysctl_size = size;
1624 
1625 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1626 	if (units)
1627 		kmem_free(units, size);
1628 	return error;
1629 }
1630 
1631 static int
1632 ccd_info_sysctl(SYSCTLFN_ARGS)
1633 {
1634 	struct sysctlnode node;
1635 	struct ccddiskinfo ccd;
1636 	struct ccd_softc *sc;
1637 	int unit;
1638 
1639 	if (newp == NULL || newlen != sizeof(int))
1640 		return EINVAL;
1641 
1642 	unit = *(const int *)newp;
1643 	newp = NULL;
1644 	newlen = 0;
1645 	ccd.ccd_ndisks = ~0;
1646 	mutex_enter(&ccd_lock);
1647 	LIST_FOREACH(sc, &ccds, sc_link) {
1648 		if (sc->sc_unit == unit) {
1649 			ccd.ccd_ileave = sc->sc_ileave;
1650 			ccd.ccd_size = sc->sc_size;
1651 			ccd.ccd_ndisks = sc->sc_nccdisks;
1652 			ccd.ccd_flags = sc->sc_flags;
1653 			break;
1654 		}
1655 	}
1656 	mutex_exit(&ccd_lock);
1657 
1658 	if (ccd.ccd_ndisks == ~0)
1659 		return ENOENT;
1660 
1661 	node = *rnode;
1662 	node.sysctl_data = &ccd;
1663 	node.sysctl_size = sizeof(ccd);
1664 
1665 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1666 }
1667 
1668 static int
1669 ccd_components_sysctl(SYSCTLFN_ARGS)
1670 {
1671 	struct sysctlnode node;
1672 	int error, unit;
1673 	size_t size;
1674 	char *names, *p, *ep;
1675 	struct ccd_softc *sc;
1676 
1677 	if (newp == NULL || newlen != sizeof(int))
1678 		return EINVAL;
1679 
1680 	size = 0;
1681 	unit = *(const int *)newp;
1682 	newp = NULL;
1683 	newlen = 0;
1684 	mutex_enter(&ccd_lock);
1685 	LIST_FOREACH(sc, &ccds, sc_link)
1686 		if (sc->sc_unit == unit) {
1687 			for (size_t i = 0; i < sc->sc_nccdisks; i++)
1688 				size += strlen(sc->sc_cinfo[i].ci_path) + 1;
1689 			break;
1690 		}
1691 	mutex_exit(&ccd_lock);
1692 
1693 	if (size == 0)
1694 		return ENOENT;
1695 	names = kmem_zalloc(size, KM_SLEEP);
1696 	if (names == NULL)
1697 		return ENOMEM;
1698 
1699 	p = names;
1700 	ep = names + size;
1701 	mutex_enter(&ccd_lock);
1702 	LIST_FOREACH(sc, &ccds, sc_link)
1703 		if (sc->sc_unit == unit) {
1704 			for (size_t i = 0; i < sc->sc_nccdisks; i++) {
1705 				char *d = sc->sc_cinfo[i].ci_path;
1706 				while (p < ep && (*p++ = *d++) != '\0')
1707 					continue;
1708 			}
1709 			break;
1710 		}
1711 	mutex_exit(&ccd_lock);
1712 
1713 	node = *rnode;
1714 	node.sysctl_data = names;
1715 	node.sysctl_size = ep - names;
1716 
1717 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1718 	kmem_free(names, size);
1719 	return error;
1720 }
1721 
1722 SYSCTL_SETUP(sysctl_kern_ccd_setup, "sysctl kern.ccd subtree setup")
1723 {
1724 	const struct sysctlnode *node = NULL;
1725 
1726 	sysctl_createv(clog, 0, NULL, &node,
1727 	    CTLFLAG_PERMANENT,
1728 	    CTLTYPE_NODE, "ccd",
1729 	    SYSCTL_DESCR("ConCatenated Disk state"),
1730 	    NULL, 0, NULL, 0,
1731 	    CTL_KERN, CTL_CREATE, CTL_EOL);
1732 
1733 	if (node == NULL)
1734 		return;
1735 
1736 	sysctl_createv(clog, 0, &node, NULL,
1737 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
1738 	    CTLTYPE_STRUCT, "units",
1739 	    SYSCTL_DESCR("List of ccd unit numbers"),
1740 	    ccd_units_sysctl, 0, NULL, 0,
1741 	    CTL_CREATE, CTL_EOL);
1742 	sysctl_createv(clog, 0, &node, NULL,
1743 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1744 	    CTLTYPE_STRUCT, "info",
1745 	    SYSCTL_DESCR("Information about a CCD unit"),
1746 	    ccd_info_sysctl, 0, NULL, 0,
1747 	    CTL_CREATE, CTL_EOL);
1748 	sysctl_createv(clog, 0, &node, NULL,
1749 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1750 	    CTLTYPE_STRUCT, "components",
1751 	    SYSCTL_DESCR("Information about CCD components"),
1752 	    ccd_components_sysctl, 0, NULL, 0,
1753 	    CTL_CREATE, CTL_EOL);
1754 }
1755