xref: /netbsd-src/sys/dev/ccd.c (revision 2c6fc41c810f5088457889d00eba558e8bc74d9e)
1 /*	$NetBSD: ccd.c,v 1.148 2014/04/06 00:56:39 joerg Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 1999, 2007, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe, and by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * "Concatenated" disk driver.
72  *
73  * Notes on concurrency:
74  *
75  * => sc_dvlock serializes access to the device nodes, excluding block I/O.
76  *
77  * => sc_iolock serializes access to (sc_flags & CCDF_INITED), disk stats,
78  *    sc_stop, sc_bufq and b_resid from master buffers.
79  *
80  * => a combination of CCDF_INITED, sc_inflight, and sc_iolock is used to
81  *    serialize I/O and configuration changes.
82  *
83  * => the in-core disk label does not change while the device is open.
84  *
85  * On memory consumption: ccd fans out I/O requests and so needs to
86  * allocate memory.  If the system is desperately low on memory, we
87  * single thread I/O.
88  */
89 
90 #include <sys/cdefs.h>
91 __KERNEL_RCSID(0, "$NetBSD: ccd.c,v 1.148 2014/04/06 00:56:39 joerg Exp $");
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/proc.h>
97 #include <sys/errno.h>
98 #include <sys/buf.h>
99 #include <sys/kmem.h>
100 #include <sys/pool.h>
101 #include <sys/module.h>
102 #include <sys/namei.h>
103 #include <sys/stat.h>
104 #include <sys/ioctl.h>
105 #include <sys/disklabel.h>
106 #include <sys/device.h>
107 #include <sys/disk.h>
108 #include <sys/syslog.h>
109 #include <sys/fcntl.h>
110 #include <sys/vnode.h>
111 #include <sys/conf.h>
112 #include <sys/mutex.h>
113 #include <sys/queue.h>
114 #include <sys/kauth.h>
115 #include <sys/kthread.h>
116 #include <sys/bufq.h>
117 #include <sys/sysctl.h>
118 
119 #include <uvm/uvm_extern.h>
120 
121 #include <dev/ccdvar.h>
122 #include <dev/dkvar.h>
123 
124 #if defined(CCDDEBUG) && !defined(DEBUG)
125 #define DEBUG
126 #endif
127 
128 #ifdef DEBUG
129 #define CCDB_FOLLOW	0x01
130 #define CCDB_INIT	0x02
131 #define CCDB_IO		0x04
132 #define CCDB_LABEL	0x08
133 #define CCDB_VNODE	0x10
134 int ccddebug = 0x00;
135 #endif
136 
137 #define	ccdunit(x)	DISKUNIT(x)
138 
/*
 * Per-component I/O descriptor.  ccdbuffer() builds one of these for each
 * piece of a master buffer that is handed to a component.
 *
 * cb_buf must remain the first member: ccdiodone() is given a pointer to
 * cb_buf and casts it straight back to the enclosing struct ccdbuf.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct buf	*cb_obp;	/* ptr. to original I/O buf */
	struct ccd_softc *cb_sc;	/* pointer to ccd softc */
	int		cb_comp;	/* target component */
	SIMPLEQ_ENTRY(ccdbuf) cb_q;	/* fifo of component buffers */
};
146 
/* component buffer pool */
static pool_cache_t ccd_cache;

/*
 * Take a struct ccdbuf from, or hand one back to, ccd_cache.
 * CCD_GETBUF() passes PR_WAITOK to pool_cache_get().
 */
#define	CCD_GETBUF()		pool_cache_get(ccd_cache, PR_WAITOK)
#define	CCD_PUTBUF(cbp)		pool_cache_put(ccd_cache, cbp)

/* Device number of the RAW_PART partition on the same major/unit as dev. */
#define CCDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), ccdunit((dev)), RAW_PART))
155 
156 /* called by main() at boot time */
157 void	ccdattach(int);
158 
159 /* called by biodone() at interrupt time */
160 static void	ccdiodone(struct buf *);
161 
162 static void	ccdinterleave(struct ccd_softc *);
163 static int	ccdinit(struct ccd_softc *, char **, struct vnode **,
164 		    struct lwp *);
165 static struct ccdbuf *ccdbuffer(struct ccd_softc *, struct buf *,
166 		    daddr_t, void *, long);
167 static void	ccdgetdefaultlabel(struct ccd_softc *, struct disklabel *);
168 static void	ccdgetdisklabel(dev_t);
169 static void	ccdmakedisklabel(struct ccd_softc *);
170 static void	ccdstart(struct ccd_softc *);
171 static void	ccdthread(void *);
172 
173 static dev_type_open(ccdopen);
174 static dev_type_close(ccdclose);
175 static dev_type_read(ccdread);
176 static dev_type_write(ccdwrite);
177 static dev_type_ioctl(ccdioctl);
178 static dev_type_strategy(ccdstrategy);
179 static dev_type_size(ccdsize);
180 
/*
 * Block device switch for ccd.  Crash dumps are not supported (nodump);
 * the entry is flagged as a disk and as MP-safe.
 */
const struct bdevsw ccd_bdevsw = {
	.d_open = ccdopen,
	.d_close = ccdclose,
	.d_strategy = ccdstrategy,
	.d_ioctl = ccdioctl,
	.d_dump = nodump,
	.d_psize = ccdsize,
	.d_flag = D_DISK | D_MPSAFE
};
190 
/*
 * Character (raw) device switch for ccd.  Only open, close, read, write
 * and ioctl are implemented; the remaining entry points use the no*
 * stubs.  The entry is flagged as a disk and as MP-safe.
 */
const struct cdevsw ccd_cdevsw = {
	.d_open = ccdopen,
	.d_close = ccdclose,
	.d_read = ccdread,
	.d_write = ccdwrite,
	.d_ioctl = ccdioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_flag = D_DISK | D_MPSAFE
};
204 
205 #ifdef DEBUG
206 static	void printiinfo(struct ccdiinfo *);
207 #endif
208 
/* Every softc handed out by ccdget(), linked through sc_link. */
static LIST_HEAD(, ccd_softc) ccds = LIST_HEAD_INITIALIZER(ccds);
/* Held by ccdget() and ccdput() around every access to the ccds list. */
static kmutex_t ccd_lock;
211 
212 static struct ccd_softc *
213 ccdcreate(int unit) {
214 	struct ccd_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
215 	if (sc == NULL) {
216 #ifdef DIAGNOSTIC
217 		printf("%s: out of memory\n", __func__);
218 #endif
219 		return NULL;
220 	}
221 	/* Initialize per-softc structures. */
222 	snprintf(sc->sc_xname, sizeof(sc->sc_xname), "ccd%d", unit);
223 	mutex_init(&sc->sc_dvlock, MUTEX_DEFAULT, IPL_NONE);
224 	sc->sc_iolock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
225 	cv_init(&sc->sc_stop, "ccdstop");
226 	cv_init(&sc->sc_push, "ccdthr");
227 	disk_init(&sc->sc_dkdev, sc->sc_xname, NULL); /* XXX */
228 	return sc;
229 }
230 
231 static void
232 ccddestroy(struct ccd_softc *sc) {
233 	mutex_obj_free(sc->sc_iolock);
234 	mutex_exit(&sc->sc_dvlock);
235 	mutex_destroy(&sc->sc_dvlock);
236 	cv_destroy(&sc->sc_stop);
237 	cv_destroy(&sc->sc_push);
238 	disk_destroy(&sc->sc_dkdev);
239 	kmem_free(sc, sizeof(*sc));
240 }
241 
242 static struct ccd_softc *
243 ccdget(int unit) {
244 	struct ccd_softc *sc;
245 	if (unit < 0) {
246 #ifdef DIAGNOSTIC
247 		panic("%s: unit %d!", __func__, unit);
248 #endif
249 		return NULL;
250 	}
251 	mutex_enter(&ccd_lock);
252 	LIST_FOREACH(sc, &ccds, sc_link) {
253 		if (sc->sc_unit == unit) {
254 			mutex_exit(&ccd_lock);
255 			return sc;
256 		}
257 	}
258 	mutex_exit(&ccd_lock);
259 	if ((sc = ccdcreate(unit)) == NULL)
260 		return NULL;
261 	mutex_enter(&ccd_lock);
262 	LIST_INSERT_HEAD(&ccds, sc, sc_link);
263 	mutex_exit(&ccd_lock);
264 	return sc;
265 }
266 
267 static void
268 ccdput(struct ccd_softc *sc) {
269 	mutex_enter(&ccd_lock);
270 	LIST_REMOVE(sc, sc_link);
271 	mutex_exit(&ccd_lock);
272 	ccddestroy(sc);
273 }
274 
275 /*
276  * Called by main() during pseudo-device attachment.  All we need
277  * to do is allocate enough space for devices to be configured later.
278  */
279 void
280 ccdattach(int num)
281 {
282 	mutex_init(&ccd_lock, MUTEX_DEFAULT, IPL_NONE);
283 
284 	/* Initialize the component buffer pool. */
285 	ccd_cache = pool_cache_init(sizeof(struct ccdbuf), 0,
286 	    0, 0, "ccdbuf", NULL, IPL_BIO, NULL, NULL, NULL);
287 }
288 
289 static int
290 ccdinit(struct ccd_softc *cs, char **cpaths, struct vnode **vpp,
291     struct lwp *l)
292 {
293 	struct ccdcinfo *ci = NULL;
294 	int ix;
295 	struct vattr va;
296 	struct ccdgeom *ccg = &cs->sc_geom;
297 	char *tmppath;
298 	int error, path_alloced;
299 	uint64_t psize, minsize;
300 	unsigned secsize, maxsecsize;
301 
302 #ifdef DEBUG
303 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
304 		printf("%s: ccdinit\n", cs->sc_xname);
305 #endif
306 
307 	/* Allocate space for the component info. */
308 	cs->sc_cinfo = kmem_alloc(cs->sc_nccdisks * sizeof(*cs->sc_cinfo),
309 	    KM_SLEEP);
310 	tmppath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
311 
312 	cs->sc_size = 0;
313 
314 	/*
315 	 * Verify that each component piece exists and record
316 	 * relevant information about it.
317 	 */
318 	maxsecsize = 0;
319 	minsize = 0;
320 	for (ix = 0, path_alloced = 0; ix < cs->sc_nccdisks; ix++) {
321 		ci = &cs->sc_cinfo[ix];
322 		ci->ci_vp = vpp[ix];
323 
324 		/*
325 		 * Copy in the pathname of the component.
326 		 */
327 		memset(tmppath, 0, MAXPATHLEN);	/* sanity */
328 		error = copyinstr(cpaths[ix], tmppath,
329 		    MAXPATHLEN, &ci->ci_pathlen);
330 		if (ci->ci_pathlen == 0)
331 			error = EINVAL;
332 		if (error) {
333 #ifdef DEBUG
334 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
335 				printf("%s: can't copy path, error = %d\n",
336 				    cs->sc_xname, error);
337 #endif
338 			goto out;
339 		}
340 		ci->ci_path = kmem_alloc(ci->ci_pathlen, KM_SLEEP);
341 		memcpy(ci->ci_path, tmppath, ci->ci_pathlen);
342 		path_alloced++;
343 
344 		/*
345 		 * XXX: Cache the component's dev_t.
346 		 */
347 		vn_lock(vpp[ix], LK_SHARED | LK_RETRY);
348 		error = VOP_GETATTR(vpp[ix], &va, l->l_cred);
349 		VOP_UNLOCK(vpp[ix]);
350 		if (error != 0) {
351 #ifdef DEBUG
352 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
353 				printf("%s: %s: getattr failed %s = %d\n",
354 				    cs->sc_xname, ci->ci_path,
355 				    "error", error);
356 #endif
357 			goto out;
358 		}
359 		ci->ci_dev = va.va_rdev;
360 
361 		/*
362 		 * Get partition information for the component.
363 		 */
364 		error = getdisksize(vpp[ix], &psize, &secsize);
365 		if (error) {
366 #ifdef DEBUG
367 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
368 				 printf("%s: %s: disksize failed, error = %d\n",
369 				     cs->sc_xname, ci->ci_path, error);
370 #endif
371 			goto out;
372 		}
373 
374 		/*
375 		 * Calculate the size, truncating to an interleave
376 		 * boundary if necessary.
377 		 */
378 		maxsecsize = secsize > maxsecsize ? secsize : maxsecsize;
379 		if (cs->sc_ileave > 1)
380 			psize -= psize % cs->sc_ileave;
381 
382 		if (psize == 0) {
383 #ifdef DEBUG
384 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
385 				printf("%s: %s: size == 0\n",
386 				    cs->sc_xname, ci->ci_path);
387 #endif
388 			error = ENODEV;
389 			goto out;
390 		}
391 
392 		if (minsize == 0 || psize < minsize)
393 			minsize = psize;
394 		ci->ci_size = psize;
395 		cs->sc_size += psize;
396 	}
397 
398 	/*
399 	 * Don't allow the interleave to be smaller than
400 	 * the biggest component sector.
401 	 */
402 	if ((cs->sc_ileave > 0) &&
403 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
404 #ifdef DEBUG
405 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
406 			printf("%s: interleave must be at least %d\n",
407 			    cs->sc_xname, (maxsecsize / DEV_BSIZE));
408 #endif
409 		error = EINVAL;
410 		goto out;
411 	}
412 
413 	/*
414 	 * If uniform interleave is desired set all sizes to that of
415 	 * the smallest component.
416 	 */
417 	if (cs->sc_flags & CCDF_UNIFORM) {
418 		for (ci = cs->sc_cinfo;
419 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
420 			ci->ci_size = minsize;
421 
422 		cs->sc_size = cs->sc_nccdisks * minsize;
423 	}
424 
425 	/*
426 	 * Construct the interleave table.
427 	 */
428 	ccdinterleave(cs);
429 
430 	/*
431 	 * Create pseudo-geometry based on 1MB cylinders.  It's
432 	 * pretty close.
433 	 */
434 	ccg->ccg_secsize = DEV_BSIZE;
435 	ccg->ccg_ntracks = 1;
436 	ccg->ccg_nsectors = 1024 * (1024 / ccg->ccg_secsize);
437 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
438 
439 	/*
440 	 * Create thread to handle deferred I/O.
441 	 */
442 	cs->sc_zap = false;
443 	error = kthread_create(PRI_BIO, KTHREAD_MPSAFE, NULL, ccdthread,
444 	    cs, &cs->sc_thread, "%s", cs->sc_xname);
445 	if (error) {
446 		printf("ccdinit: can't create thread: %d\n", error);
447 		goto out;
448 	}
449 
450 	/*
451 	 * Only now that everything is set up can we enable the device.
452 	 */
453 	mutex_enter(cs->sc_iolock);
454 	cs->sc_flags |= CCDF_INITED;
455 	mutex_exit(cs->sc_iolock);
456 	kmem_free(tmppath, MAXPATHLEN);
457 	return (0);
458 
459  out:
460 	for (ix = 0; ix < path_alloced; ix++) {
461 		kmem_free(cs->sc_cinfo[ix].ci_path,
462 		    cs->sc_cinfo[ix].ci_pathlen);
463 	}
464 	kmem_free(cs->sc_cinfo, cs->sc_nccdisks * sizeof(struct ccdcinfo));
465 	kmem_free(tmppath, MAXPATHLEN);
466 	return (error);
467 }
468 
469 static void
470 ccdinterleave(struct ccd_softc *cs)
471 {
472 	struct ccdcinfo *ci, *smallci;
473 	struct ccdiinfo *ii;
474 	daddr_t bn, lbn;
475 	int ix;
476 	u_long size;
477 
478 #ifdef DEBUG
479 	if (ccddebug & CCDB_INIT)
480 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
481 #endif
482 	/*
483 	 * Allocate an interleave table.
484 	 * Chances are this is too big, but we don't care.
485 	 */
486 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
487 	cs->sc_itable = kmem_zalloc(size, KM_SLEEP);
488 
489 	/*
490 	 * Trivial case: no interleave (actually interleave of disk size).
491 	 * Each table entry represents a single component in its entirety.
492 	 */
493 	if (cs->sc_ileave == 0) {
494 		bn = 0;
495 		ii = cs->sc_itable;
496 
497 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
498 			/* Allocate space for ii_index. */
499 			ii->ii_indexsz = sizeof(int);
500 			ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP);
501 			ii->ii_ndisk = 1;
502 			ii->ii_startblk = bn;
503 			ii->ii_startoff = 0;
504 			ii->ii_index[0] = ix;
505 			bn += cs->sc_cinfo[ix].ci_size;
506 			ii++;
507 		}
508 		ii->ii_ndisk = 0;
509 #ifdef DEBUG
510 		if (ccddebug & CCDB_INIT)
511 			printiinfo(cs->sc_itable);
512 #endif
513 		return;
514 	}
515 
516 	/*
517 	 * The following isn't fast or pretty; it doesn't have to be.
518 	 */
519 	size = 0;
520 	bn = lbn = 0;
521 	for (ii = cs->sc_itable; ; ii++) {
522 		/* Allocate space for ii_index. */
523 		ii->ii_indexsz = sizeof(int) * cs->sc_nccdisks;
524 		ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP);
525 
526 		/*
527 		 * Locate the smallest of the remaining components
528 		 */
529 		smallci = NULL;
530 		for (ci = cs->sc_cinfo;
531 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
532 			if (ci->ci_size > size &&
533 			    (smallci == NULL ||
534 			     ci->ci_size < smallci->ci_size))
535 				smallci = ci;
536 
537 		/*
538 		 * Nobody left, all done
539 		 */
540 		if (smallci == NULL) {
541 			ii->ii_ndisk = 0;
542 			break;
543 		}
544 
545 		/*
546 		 * Record starting logical block and component offset
547 		 */
548 		ii->ii_startblk = bn / cs->sc_ileave;
549 		ii->ii_startoff = lbn;
550 
551 		/*
552 		 * Determine how many disks take part in this interleave
553 		 * and record their indices.
554 		 */
555 		ix = 0;
556 		for (ci = cs->sc_cinfo;
557 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
558 			if (ci->ci_size >= smallci->ci_size)
559 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
560 		ii->ii_ndisk = ix;
561 		bn += ix * (smallci->ci_size - size);
562 		lbn = smallci->ci_size / cs->sc_ileave;
563 		size = smallci->ci_size;
564 	}
565 #ifdef DEBUG
566 	if (ccddebug & CCDB_INIT)
567 		printiinfo(cs->sc_itable);
568 #endif
569 }
570 
571 /* ARGSUSED */
572 static int
573 ccdopen(dev_t dev, int flags, int fmt, struct lwp *l)
574 {
575 	int unit = ccdunit(dev);
576 	struct ccd_softc *cs;
577 	struct disklabel *lp;
578 	int error = 0, part, pmask;
579 
580 #ifdef DEBUG
581 	if (ccddebug & CCDB_FOLLOW)
582 		printf("ccdopen(0x%"PRIx64", 0x%x)\n", dev, flags);
583 #endif
584 	if ((cs = ccdget(unit)) == NULL)
585 		return ENXIO;
586 
587 	mutex_enter(&cs->sc_dvlock);
588 
589 	lp = cs->sc_dkdev.dk_label;
590 
591 	part = DISKPART(dev);
592 	pmask = (1 << part);
593 
594 	/*
595 	 * If we're initialized, check to see if there are any other
596 	 * open partitions.  If not, then it's safe to update
597 	 * the in-core disklabel.  Only read the disklabel if it is
598 	 * not already valid.
599 	 */
600 	if ((cs->sc_flags & (CCDF_INITED|CCDF_VLABEL)) == CCDF_INITED &&
601 	    cs->sc_dkdev.dk_openmask == 0)
602 		ccdgetdisklabel(dev);
603 
604 	/* Check that the partition exists. */
605 	if (part != RAW_PART) {
606 		if (((cs->sc_flags & CCDF_INITED) == 0) ||
607 		    ((part >= lp->d_npartitions) ||
608 		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
609 			error = ENXIO;
610 			goto done;
611 		}
612 	}
613 
614 	/* Prevent our unit from being unconfigured while open. */
615 	switch (fmt) {
616 	case S_IFCHR:
617 		cs->sc_dkdev.dk_copenmask |= pmask;
618 		break;
619 
620 	case S_IFBLK:
621 		cs->sc_dkdev.dk_bopenmask |= pmask;
622 		break;
623 	}
624 	cs->sc_dkdev.dk_openmask =
625 	    cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask;
626 
627  done:
628 	mutex_exit(&cs->sc_dvlock);
629 	return (error);
630 }
631 
632 /* ARGSUSED */
633 static int
634 ccdclose(dev_t dev, int flags, int fmt, struct lwp *l)
635 {
636 	int unit = ccdunit(dev);
637 	struct ccd_softc *cs;
638 	int part;
639 
640 #ifdef DEBUG
641 	if (ccddebug & CCDB_FOLLOW)
642 		printf("ccdclose(0x%"PRIx64", 0x%x)\n", dev, flags);
643 #endif
644 
645 	if ((cs = ccdget(unit)) == NULL)
646 		return ENXIO;
647 
648 	mutex_enter(&cs->sc_dvlock);
649 
650 	part = DISKPART(dev);
651 
652 	/* ...that much closer to allowing unconfiguration... */
653 	switch (fmt) {
654 	case S_IFCHR:
655 		cs->sc_dkdev.dk_copenmask &= ~(1 << part);
656 		break;
657 
658 	case S_IFBLK:
659 		cs->sc_dkdev.dk_bopenmask &= ~(1 << part);
660 		break;
661 	}
662 	cs->sc_dkdev.dk_openmask =
663 	    cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask;
664 
665 	if (cs->sc_dkdev.dk_openmask == 0) {
666 		if ((cs->sc_flags & CCDF_KLABEL) == 0)
667 			cs->sc_flags &= ~CCDF_VLABEL;
668 	}
669 
670 	mutex_exit(&cs->sc_dvlock);
671 	return (0);
672 }
673 
674 static bool
675 ccdbackoff(struct ccd_softc *cs)
676 {
677 
678 	/* XXX Arbitrary, should be a uvm call. */
679 	return uvmexp.free < (uvmexp.freemin >> 1) &&
680 	    disk_isbusy(&cs->sc_dkdev);
681 }
682 
683 static void
684 ccdthread(void *cookie)
685 {
686 	struct ccd_softc *cs;
687 
688 	cs = cookie;
689 
690 #ifdef DEBUG
691  	if (ccddebug & CCDB_FOLLOW)
692  		printf("ccdthread: hello\n");
693 #endif
694 
695 	mutex_enter(cs->sc_iolock);
696 	while (__predict_true(!cs->sc_zap)) {
697 		if (bufq_peek(cs->sc_bufq) == NULL) {
698 			/* Nothing to do. */
699 			cv_wait(&cs->sc_push, cs->sc_iolock);
700 			continue;
701 		}
702 		if (ccdbackoff(cs)) {
703 			/* Wait for memory to become available. */
704 			(void)cv_timedwait(&cs->sc_push, cs->sc_iolock, 1);
705 			continue;
706 		}
707 #ifdef DEBUG
708  		if (ccddebug & CCDB_FOLLOW)
709  			printf("ccdthread: dispatching I/O\n");
710 #endif
711 		ccdstart(cs);
712 		mutex_enter(cs->sc_iolock);
713 	}
714 	cs->sc_thread = NULL;
715 	mutex_exit(cs->sc_iolock);
716 #ifdef DEBUG
717  	if (ccddebug & CCDB_FOLLOW)
718  		printf("ccdthread: goodbye\n");
719 #endif
720 	kthread_exit(0);
721 }
722 
723 static void
724 ccdstrategy(struct buf *bp)
725 {
726 	int unit = ccdunit(bp->b_dev);
727 	struct ccd_softc *cs;
728 	if ((cs = ccdget(unit)) == NULL)
729 		return;
730 
731 	/* Must be open or reading label. */
732 	KASSERT(cs->sc_dkdev.dk_openmask != 0 ||
733 	    (cs->sc_flags & CCDF_RLABEL) != 0);
734 
735 	mutex_enter(cs->sc_iolock);
736 	/* Synchronize with device init/uninit. */
737 	if (__predict_false((cs->sc_flags & CCDF_INITED) == 0)) {
738 		mutex_exit(cs->sc_iolock);
739 #ifdef DEBUG
740  		if (ccddebug & CCDB_FOLLOW)
741  			printf("ccdstrategy: unit %d: not inited\n", unit);
742 #endif
743  		bp->b_error = ENXIO;
744  		bp->b_resid = bp->b_bcount;
745  		biodone(bp);
746 		return;
747 	}
748 
749 	/* Defer to thread if system is low on memory. */
750 	bufq_put(cs->sc_bufq, bp);
751 	if (__predict_false(ccdbackoff(cs))) {
752 		mutex_exit(cs->sc_iolock);
753 #ifdef DEBUG
754  		if (ccddebug & CCDB_FOLLOW)
755  			printf("ccdstrategy: holding off on I/O\n");
756 #endif
757 		return;
758 	}
759 	ccdstart(cs);
760 }
761 
762 static void
763 ccdstart(struct ccd_softc *cs)
764 {
765 	daddr_t blkno;
766 	int wlabel;
767 	struct disklabel *lp;
768 	long bcount, rcount;
769 	struct ccdbuf *cbp;
770 	char *addr;
771 	daddr_t bn;
772 	vnode_t *vp;
773 	buf_t *bp;
774 
775 	KASSERT(mutex_owned(cs->sc_iolock));
776 
777 	disk_busy(&cs->sc_dkdev);
778 	bp = bufq_get(cs->sc_bufq);
779 	KASSERT(bp != NULL);
780 
781 #ifdef DEBUG
782 	if (ccddebug & CCDB_FOLLOW)
783 		printf("ccdstart(%s, %p)\n", cs->sc_xname, bp);
784 #endif
785 
786 	/* If it's a nil transfer, wake up the top half now. */
787 	if (bp->b_bcount == 0)
788 		goto done;
789 
790 	lp = cs->sc_dkdev.dk_label;
791 
792 	/*
793 	 * Do bounds checking and adjust transfer.  If there's an
794 	 * error, the bounds check will flag that for us.  Convert
795 	 * the partition relative block number to an absolute.
796 	 */
797 	blkno = bp->b_blkno;
798 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
799 	if (DISKPART(bp->b_dev) != RAW_PART) {
800 		if (bounds_check_with_label(&cs->sc_dkdev, bp, wlabel) <= 0)
801 			goto done;
802 		blkno += lp->d_partitions[DISKPART(bp->b_dev)].p_offset;
803 	}
804 	mutex_exit(cs->sc_iolock);
805 	bp->b_rawblkno = blkno;
806 
807 	/* Allocate the component buffers and start I/O! */
808 	bp->b_resid = bp->b_bcount;
809 	bn = bp->b_rawblkno;
810 	addr = bp->b_data;
811 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
812 		cbp = ccdbuffer(cs, bp, bn, addr, bcount);
813 		rcount = cbp->cb_buf.b_bcount;
814 		bn += btodb(rcount);
815 		addr += rcount;
816 		vp = cbp->cb_buf.b_vp;
817 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
818 			mutex_enter(vp->v_interlock);
819 			vp->v_numoutput++;
820 			mutex_exit(vp->v_interlock);
821 		}
822 		(void)VOP_STRATEGY(vp, &cbp->cb_buf);
823 	}
824 	return;
825 
826  done:
827 	disk_unbusy(&cs->sc_dkdev, 0, 0);
828 	cv_broadcast(&cs->sc_stop);
829 	cv_broadcast(&cs->sc_push);
830 	mutex_exit(cs->sc_iolock);
831 	bp->b_resid = bp->b_bcount;
832 	biodone(bp);
833 }
834 
835 /*
836  * Build a component buffer header.
837  */
838 static struct ccdbuf *
839 ccdbuffer(struct ccd_softc *cs, struct buf *bp, daddr_t bn, void *addr,
840     long bcount)
841 {
842 	struct ccdcinfo *ci;
843 	struct ccdbuf *cbp;
844 	daddr_t cbn, cboff;
845 	u_int64_t cbc;
846 	int ccdisk;
847 
848 #ifdef DEBUG
849 	if (ccddebug & CCDB_IO)
850 		printf("ccdbuffer(%p, %p, %" PRId64 ", %p, %ld)\n",
851 		       cs, bp, bn, addr, bcount);
852 #endif
853 	/*
854 	 * Determine which component bn falls in.
855 	 */
856 	cbn = bn;
857 	cboff = 0;
858 
859 	/*
860 	 * Serially concatenated
861 	 */
862 	if (cs->sc_ileave == 0) {
863 		daddr_t sblk;
864 
865 		sblk = 0;
866 		for (ccdisk = 0, ci = &cs->sc_cinfo[ccdisk];
867 		    cbn >= sblk + ci->ci_size;
868 		    ccdisk++, ci = &cs->sc_cinfo[ccdisk])
869 			sblk += ci->ci_size;
870 		cbn -= sblk;
871 	}
872 	/*
873 	 * Interleaved
874 	 */
875 	else {
876 		struct ccdiinfo *ii;
877 		int off;
878 
879 		cboff = cbn % cs->sc_ileave;
880 		cbn /= cs->sc_ileave;
881 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++)
882 			if (ii->ii_startblk > cbn)
883 				break;
884 		ii--;
885 		off = cbn - ii->ii_startblk;
886 		if (ii->ii_ndisk == 1) {
887 			ccdisk = ii->ii_index[0];
888 			cbn = ii->ii_startoff + off;
889 		} else {
890 			ccdisk = ii->ii_index[off % ii->ii_ndisk];
891 			cbn = ii->ii_startoff + off / ii->ii_ndisk;
892 		}
893 		cbn *= cs->sc_ileave;
894 		ci = &cs->sc_cinfo[ccdisk];
895 	}
896 
897 	/*
898 	 * Fill in the component buf structure.
899 	 */
900 	cbp = CCD_GETBUF();
901 	KASSERT(cbp != NULL);
902 	buf_init(&cbp->cb_buf);
903 	cbp->cb_buf.b_flags = bp->b_flags;
904 	cbp->cb_buf.b_oflags = bp->b_oflags;
905 	cbp->cb_buf.b_cflags = bp->b_cflags;
906 	cbp->cb_buf.b_iodone = ccdiodone;
907 	cbp->cb_buf.b_proc = bp->b_proc;
908 	cbp->cb_buf.b_dev = ci->ci_dev;
909 	cbp->cb_buf.b_blkno = cbn + cboff;
910 	cbp->cb_buf.b_data = addr;
911 	cbp->cb_buf.b_vp = ci->ci_vp;
912 	cbp->cb_buf.b_objlock = ci->ci_vp->v_interlock;
913 	if (cs->sc_ileave == 0)
914 		cbc = dbtob((u_int64_t)(ci->ci_size - cbn));
915 	else
916 		cbc = dbtob((u_int64_t)(cs->sc_ileave - cboff));
917 	cbp->cb_buf.b_bcount = cbc < bcount ? cbc : bcount;
918 
919 	/*
920 	 * context for ccdiodone
921 	 */
922 	cbp->cb_obp = bp;
923 	cbp->cb_sc = cs;
924 	cbp->cb_comp = ccdisk;
925 
926 	BIO_COPYPRIO(&cbp->cb_buf, bp);
927 
928 #ifdef DEBUG
929 	if (ccddebug & CCDB_IO)
930 		printf(" dev 0x%"PRIx64"(u%lu): cbp %p bn %" PRId64 " addr %p"
931 		       " bcnt %d\n",
932 		    ci->ci_dev, (unsigned long) (ci-cs->sc_cinfo), cbp,
933 		    cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
934 		    cbp->cb_buf.b_bcount);
935 #endif
936 
937 	return (cbp);
938 }
939 
940 /*
941  * Called at interrupt time.
942  * Mark the component as done and if all components are done,
943  * take a ccd interrupt.
944  */
945 static void
946 ccdiodone(struct buf *vbp)
947 {
948 	struct ccdbuf *cbp = (struct ccdbuf *) vbp;
949 	struct buf *bp = cbp->cb_obp;
950 	struct ccd_softc *cs = cbp->cb_sc;
951 	int count;
952 
953 #ifdef DEBUG
954 	if (ccddebug & CCDB_FOLLOW)
955 		printf("ccdiodone(%p)\n", cbp);
956 	if (ccddebug & CCDB_IO) {
957 		printf("ccdiodone: bp %p bcount %d resid %d\n",
958 		       bp, bp->b_bcount, bp->b_resid);
959 		printf(" dev 0x%"PRIx64"(u%d), cbp %p bn %" PRId64 " addr %p"
960 		       " bcnt %d\n",
961 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
962 		       cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
963 		       cbp->cb_buf.b_bcount);
964 	}
965 #endif
966 
967 	if (cbp->cb_buf.b_error != 0) {
968 		bp->b_error = cbp->cb_buf.b_error;
969 		printf("%s: error %d on component %d\n",
970 		       cs->sc_xname, bp->b_error, cbp->cb_comp);
971 	}
972 	count = cbp->cb_buf.b_bcount;
973 	buf_destroy(&cbp->cb_buf);
974 	CCD_PUTBUF(cbp);
975 
976 	/*
977 	 * If all done, "interrupt".
978 	 */
979 	mutex_enter(cs->sc_iolock);
980 	bp->b_resid -= count;
981 	if (bp->b_resid < 0)
982 		panic("ccdiodone: count");
983 	if (bp->b_resid == 0) {
984 		/*
985 		 * Request is done for better or worse, wakeup the top half.
986 		 */
987 		if (bp->b_error != 0)
988 			bp->b_resid = bp->b_bcount;
989 		disk_unbusy(&cs->sc_dkdev, (bp->b_bcount - bp->b_resid),
990 		    (bp->b_flags & B_READ));
991 		if (!disk_isbusy(&cs->sc_dkdev)) {
992 			if (bufq_peek(cs->sc_bufq) != NULL) {
993 				cv_broadcast(&cs->sc_push);
994 			}
995 			cv_broadcast(&cs->sc_stop);
996 		}
997 		mutex_exit(cs->sc_iolock);
998 		biodone(bp);
999 	} else
1000 		mutex_exit(cs->sc_iolock);
1001 }
1002 
1003 /* ARGSUSED */
1004 static int
1005 ccdread(dev_t dev, struct uio *uio, int flags)
1006 {
1007 	int unit = ccdunit(dev);
1008 	struct ccd_softc *cs;
1009 
1010 #ifdef DEBUG
1011 	if (ccddebug & CCDB_FOLLOW)
1012 		printf("ccdread(0x%"PRIx64", %p)\n", dev, uio);
1013 #endif
1014 	if ((cs = ccdget(unit)) == NULL)
1015 		return 0;
1016 
1017 	/* Unlocked advisory check, ccdstrategy check is synchronous. */
1018 	if ((cs->sc_flags & CCDF_INITED) == 0)
1019 		return (ENXIO);
1020 
1021 	return (physio(ccdstrategy, NULL, dev, B_READ, minphys, uio));
1022 }
1023 
1024 /* ARGSUSED */
1025 static int
1026 ccdwrite(dev_t dev, struct uio *uio, int flags)
1027 {
1028 	int unit = ccdunit(dev);
1029 	struct ccd_softc *cs;
1030 
1031 #ifdef DEBUG
1032 	if (ccddebug & CCDB_FOLLOW)
1033 		printf("ccdwrite(0x%"PRIx64", %p)\n", dev, uio);
1034 #endif
1035 	if ((cs = ccdget(unit)) == NULL)
1036 		return ENOENT;
1037 
1038 	/* Unlocked advisory check, ccdstrategy check is synchronous. */
1039 	if ((cs->sc_flags & CCDF_INITED) == 0)
1040 		return (ENXIO);
1041 
1042 	return (physio(ccdstrategy, NULL, dev, B_WRITE, minphys, uio));
1043 }
1044 
1045 static int
1046 ccdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1047 {
1048 	int unit = ccdunit(dev);
1049 	int i, j, lookedup = 0, error = 0;
1050 	int part, pmask;
1051 	struct ccd_softc *cs;
1052 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1053 	kauth_cred_t uc;
1054 	char **cpp;
1055 	struct pathbuf *pb;
1056 	struct vnode **vpp;
1057 #ifdef __HAVE_OLD_DISKLABEL
1058 	struct disklabel newlabel;
1059 #endif
1060 
1061 	if ((cs = ccdget(unit)) == NULL)
1062 		return ENOENT;
1063 	uc = kauth_cred_get();
1064 
1065 	/* Must be open for writes for these commands... */
1066 	switch (cmd) {
1067 	case CCDIOCSET:
1068 	case CCDIOCCLR:
1069 	case DIOCSDINFO:
1070 	case DIOCWDINFO:
1071 #ifdef __HAVE_OLD_DISKLABEL
1072 	case ODIOCSDINFO:
1073 	case ODIOCWDINFO:
1074 #endif
1075 	case DIOCKLABEL:
1076 	case DIOCWLABEL:
1077 		if ((flag & FWRITE) == 0)
1078 			return (EBADF);
1079 	}
1080 
1081 	mutex_enter(&cs->sc_dvlock);
1082 
1083 	/* Must be initialized for these... */
1084 	switch (cmd) {
1085 	case CCDIOCCLR:
1086 	case DIOCGDINFO:
1087 	case DIOCCACHESYNC:
1088 	case DIOCSDINFO:
1089 	case DIOCWDINFO:
1090 	case DIOCGPART:
1091 	case DIOCWLABEL:
1092 	case DIOCKLABEL:
1093 	case DIOCGDEFLABEL:
1094 #ifdef __HAVE_OLD_DISKLABEL
1095 	case ODIOCGDINFO:
1096 	case ODIOCSDINFO:
1097 	case ODIOCWDINFO:
1098 	case ODIOCGDEFLABEL:
1099 #endif
1100 		if ((cs->sc_flags & CCDF_INITED) == 0) {
1101 			error = ENXIO;
1102 			goto out;
1103 		}
1104 	}
1105 
1106 	switch (cmd) {
1107 	case CCDIOCSET:
1108 		if (cs->sc_flags & CCDF_INITED) {
1109 			error = EBUSY;
1110 			goto out;
1111 		}
1112 
1113 		/* Validate the flags. */
1114 		if ((ccio->ccio_flags & CCDF_USERMASK) != ccio->ccio_flags) {
1115 			error = EINVAL;
1116 			goto out;
1117 		}
1118 
1119 		if (ccio->ccio_ndisks > CCD_MAXNDISKS ||
1120 		    ccio->ccio_ndisks == 0) {
1121 			error = EINVAL;
1122 			goto out;
1123 		}
1124 
1125 		/* Fill in some important bits. */
1126 		cs->sc_ileave = ccio->ccio_ileave;
1127 		cs->sc_nccdisks = ccio->ccio_ndisks;
1128 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1129 
1130 		/*
1131 		 * Allocate space for and copy in the array of
1132 		 * componet pathnames and device numbers.
1133 		 */
1134 		cpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*cpp), KM_SLEEP);
1135 		vpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*vpp), KM_SLEEP);
1136 		error = copyin(ccio->ccio_disks, cpp,
1137 		    ccio->ccio_ndisks * sizeof(*cpp));
1138 		if (error) {
1139 			kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1140 			kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1141 			goto out;
1142 		}
1143 
1144 #ifdef DEBUG
1145 		if (ccddebug & CCDB_INIT)
1146 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1147 				printf("ccdioctl: component %d: %p\n",
1148 				    i, cpp[i]);
1149 #endif
1150 
1151 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1152 #ifdef DEBUG
1153 			if (ccddebug & CCDB_INIT)
1154 				printf("ccdioctl: lookedup = %d\n", lookedup);
1155 #endif
1156 			error = pathbuf_copyin(cpp[i], &pb);
1157 			if (error == 0) {
1158 				error = dk_lookup(pb, l, &vpp[i]);
1159 			}
1160 			pathbuf_destroy(pb);
1161 			if (error != 0) {
1162 				for (j = 0; j < lookedup; ++j)
1163 					(void)vn_close(vpp[j], FREAD|FWRITE,
1164 					    uc);
1165 				kmem_free(vpp, ccio->ccio_ndisks *
1166 				    sizeof(*vpp));
1167 				kmem_free(cpp, ccio->ccio_ndisks *
1168 				    sizeof(*cpp));
1169 				goto out;
1170 			}
1171 			++lookedup;
1172 		}
1173 
1174 		/* Attach the disk. */
1175 		disk_attach(&cs->sc_dkdev);
1176 		bufq_alloc(&cs->sc_bufq, "fcfs", 0);
1177 
1178 		/*
1179 		 * Initialize the ccd.  Fills in the softc for us.
1180 		 */
1181 		if ((error = ccdinit(cs, cpp, vpp, l)) != 0) {
1182 			for (j = 0; j < lookedup; ++j)
1183 				(void)vn_close(vpp[j], FREAD|FWRITE,
1184 				    uc);
1185 			kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1186 			kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1187 			disk_detach(&cs->sc_dkdev);
1188 			bufq_free(cs->sc_bufq);
1189 			goto out;
1190 		}
1191 
1192 		/* We can free the temporary variables now. */
1193 		kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1194 		kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1195 
1196 		/*
1197 		 * The ccd has been successfully initialized, so
1198 		 * we can place it into the array.  Don't try to
1199 		 * read the disklabel until the disk has been attached,
1200 		 * because space for the disklabel is allocated
1201 		 * in disk_attach();
1202 		 */
1203 		ccio->ccio_unit = unit;
1204 		ccio->ccio_size = cs->sc_size;
1205 
1206 		/* Try and read the disklabel. */
1207 		ccdgetdisklabel(dev);
1208 		break;
1209 
1210 	case CCDIOCCLR:
1211 		/*
1212 		 * Don't unconfigure if any other partitions are open
1213 		 * or if both the character and block flavors of this
1214 		 * partition are open.
1215 		 */
1216 		part = DISKPART(dev);
1217 		pmask = (1 << part);
1218 		if ((cs->sc_dkdev.dk_openmask & ~pmask) ||
1219 		    ((cs->sc_dkdev.dk_bopenmask & pmask) &&
1220 		    (cs->sc_dkdev.dk_copenmask & pmask))) {
1221 			error = EBUSY;
1222 			goto out;
1223 		}
1224 
1225 		/* Stop new I/O, wait for in-flight I/O to complete. */
1226 		mutex_enter(cs->sc_iolock);
1227 		cs->sc_flags &= ~(CCDF_INITED|CCDF_VLABEL);
1228 		cs->sc_zap = true;
1229 		while (disk_isbusy(&cs->sc_dkdev) ||
1230 		    bufq_peek(cs->sc_bufq) != NULL ||
1231 		    cs->sc_thread != NULL) {
1232 			cv_broadcast(&cs->sc_push);
1233 			(void)cv_timedwait(&cs->sc_stop, cs->sc_iolock, hz);
1234 		}
1235 		mutex_exit(cs->sc_iolock);
1236 
1237 		/*
1238 		 * Free ccd_softc information and clear entry.
1239 		 */
1240 
1241 		/* Close the components and free their pathnames. */
1242 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1243 			/*
1244 			 * XXX: this close could potentially fail and
1245 			 * cause Bad Things.  Maybe we need to force
1246 			 * the close to happen?
1247 			 */
1248 #ifdef DEBUG
1249 			if (ccddebug & CCDB_VNODE)
1250 				vprint("CCDIOCCLR: vnode info",
1251 				    cs->sc_cinfo[i].ci_vp);
1252 #endif
1253 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1254 			    uc);
1255 			kmem_free(cs->sc_cinfo[i].ci_path,
1256 			    cs->sc_cinfo[i].ci_pathlen);
1257 		}
1258 
1259 		/* Free interleave index. */
1260 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) {
1261 			kmem_free(cs->sc_itable[i].ii_index,
1262 			    cs->sc_itable[i].ii_indexsz);
1263 		}
1264 
1265 		/* Free component info and interleave table. */
1266 		kmem_free(cs->sc_cinfo, cs->sc_nccdisks *
1267 		    sizeof(struct ccdcinfo));
1268 		kmem_free(cs->sc_itable, (cs->sc_nccdisks + 1) *
1269 		    sizeof(struct ccdiinfo));
1270 
1271 		/* Detatch the disk. */
1272 		disk_detach(&cs->sc_dkdev);
1273 		bufq_free(cs->sc_bufq);
1274 		ccdput(cs);
1275 		/* Don't break, otherwise cs is read again. */
1276 		return 0;
1277 
1278 	case DIOCGDINFO:
1279 		*(struct disklabel *)data = *(cs->sc_dkdev.dk_label);
1280 		break;
1281 
1282 #ifdef __HAVE_OLD_DISKLABEL
1283 	case ODIOCGDINFO:
1284 		newlabel = *(cs->sc_dkdev.dk_label);
1285 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1286 			return ENOTTY;
1287 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1288 		break;
1289 #endif
1290 
1291 	case DIOCGPART:
1292 		((struct partinfo *)data)->disklab = cs->sc_dkdev.dk_label;
1293 		((struct partinfo *)data)->part =
1294 		    &cs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1295 		break;
1296 
1297 	case DIOCCACHESYNC:
1298 		/*
1299 		 * XXX Do we really need to care about having a writable
1300 		 * file descriptor here?
1301 		 */
1302 		if ((flag & FWRITE) == 0)
1303 			return (EBADF);
1304 
1305 		/*
1306 		 * We pass this call down to all components and report
1307 		 * the first error we encounter.
1308 		 */
1309 		for (error = 0, i = 0; i < cs->sc_nccdisks; i++) {
1310 			j = VOP_IOCTL(cs->sc_cinfo[i].ci_vp, cmd, data,
1311 				      flag, uc);
1312 			if (j != 0 && error == 0)
1313 				error = j;
1314 		}
1315 		break;
1316 
1317 	case DIOCWDINFO:
1318 	case DIOCSDINFO:
1319 #ifdef __HAVE_OLD_DISKLABEL
1320 	case ODIOCWDINFO:
1321 	case ODIOCSDINFO:
1322 #endif
1323 	{
1324 		struct disklabel *lp;
1325 #ifdef __HAVE_OLD_DISKLABEL
1326 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1327 			memset(&newlabel, 0, sizeof newlabel);
1328 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
1329 			lp = &newlabel;
1330 		} else
1331 #endif
1332 		lp = (struct disklabel *)data;
1333 
1334 		cs->sc_flags |= CCDF_LABELLING;
1335 
1336 		error = setdisklabel(cs->sc_dkdev.dk_label,
1337 		    lp, 0, cs->sc_dkdev.dk_cpulabel);
1338 		if (error == 0) {
1339 			if (cmd == DIOCWDINFO
1340 #ifdef __HAVE_OLD_DISKLABEL
1341 			    || cmd == ODIOCWDINFO
1342 #endif
1343 			   )
1344 				error = writedisklabel(CCDLABELDEV(dev),
1345 				    ccdstrategy, cs->sc_dkdev.dk_label,
1346 				    cs->sc_dkdev.dk_cpulabel);
1347 		}
1348 
1349 		cs->sc_flags &= ~CCDF_LABELLING;
1350 		break;
1351 	}
1352 
1353 	case DIOCKLABEL:
1354 		if (*(int *)data != 0)
1355 			cs->sc_flags |= CCDF_KLABEL;
1356 		else
1357 			cs->sc_flags &= ~CCDF_KLABEL;
1358 		break;
1359 
1360 	case DIOCWLABEL:
1361 		if (*(int *)data != 0)
1362 			cs->sc_flags |= CCDF_WLABEL;
1363 		else
1364 			cs->sc_flags &= ~CCDF_WLABEL;
1365 		break;
1366 
1367 	case DIOCGDEFLABEL:
1368 		ccdgetdefaultlabel(cs, (struct disklabel *)data);
1369 		break;
1370 
1371 #ifdef __HAVE_OLD_DISKLABEL
1372 	case ODIOCGDEFLABEL:
1373 		ccdgetdefaultlabel(cs, &newlabel);
1374 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1375 			return ENOTTY;
1376 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1377 		break;
1378 #endif
1379 
1380 	default:
1381 		error = ENOTTY;
1382 	}
1383 
1384  out:
1385 	mutex_exit(&cs->sc_dvlock);
1386 	return (error);
1387 }
1388 
1389 static int
1390 ccdsize(dev_t dev)
1391 {
1392 	struct ccd_softc *cs;
1393 	struct disklabel *lp;
1394 	int part, unit, omask, size;
1395 
1396 	unit = ccdunit(dev);
1397 	if ((cs = ccdget(unit)) == NULL)
1398 		return -1;
1399 
1400 	if ((cs->sc_flags & CCDF_INITED) == 0)
1401 		return (-1);
1402 
1403 	part = DISKPART(dev);
1404 	omask = cs->sc_dkdev.dk_openmask & (1 << part);
1405 	lp = cs->sc_dkdev.dk_label;
1406 
1407 	if (omask == 0 && ccdopen(dev, 0, S_IFBLK, curlwp))
1408 		return (-1);
1409 
1410 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
1411 		size = -1;
1412 	else
1413 		size = lp->d_partitions[part].p_size *
1414 		    (lp->d_secsize / DEV_BSIZE);
1415 
1416 	if (omask == 0 && ccdclose(dev, 0, S_IFBLK, curlwp))
1417 		return (-1);
1418 
1419 	return (size);
1420 }
1421 
1422 static void
1423 ccdgetdefaultlabel(struct ccd_softc *cs, struct disklabel *lp)
1424 {
1425 	struct ccdgeom *ccg = &cs->sc_geom;
1426 
1427 	memset(lp, 0, sizeof(*lp));
1428 
1429 	lp->d_secperunit = cs->sc_size;
1430 	lp->d_secsize = ccg->ccg_secsize;
1431 	lp->d_nsectors = ccg->ccg_nsectors;
1432 	lp->d_ntracks = ccg->ccg_ntracks;
1433 	lp->d_ncylinders = ccg->ccg_ncylinders;
1434 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1435 
1436 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1437 	lp->d_type = DTYPE_CCD;
1438 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1439 	lp->d_rpm = 3600;
1440 	lp->d_interleave = 1;
1441 	lp->d_flags = 0;
1442 
1443 	lp->d_partitions[RAW_PART].p_offset = 0;
1444 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1445 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1446 	lp->d_npartitions = RAW_PART + 1;
1447 
1448 	lp->d_magic = DISKMAGIC;
1449 	lp->d_magic2 = DISKMAGIC;
1450 	lp->d_checksum = dkcksum(cs->sc_dkdev.dk_label);
1451 }
1452 
1453 /*
1454  * Read the disklabel from the ccd.  If one is not present, fake one
1455  * up.
1456  */
1457 static void
1458 ccdgetdisklabel(dev_t dev)
1459 {
1460 	int unit = ccdunit(dev);
1461 	struct ccd_softc *cs;
1462 	const char *errstring;
1463 	struct disklabel *lp;
1464 	struct cpu_disklabel *clp;
1465 
1466 	if ((cs = ccdget(unit)) == NULL)
1467 		return;
1468 	lp = cs->sc_dkdev.dk_label;
1469 	clp = cs->sc_dkdev.dk_cpulabel;
1470 	KASSERT(mutex_owned(&cs->sc_dvlock));
1471 
1472 	memset(clp, 0, sizeof(*clp));
1473 
1474 	ccdgetdefaultlabel(cs, lp);
1475 
1476 	/*
1477 	 * Call the generic disklabel extraction routine.
1478 	 */
1479 	cs->sc_flags |= CCDF_RLABEL;
1480 	if ((cs->sc_flags & CCDF_NOLABEL) != 0)
1481 		errstring = "CCDF_NOLABEL set; ignoring on-disk label";
1482 	else
1483 		errstring = readdisklabel(CCDLABELDEV(dev), ccdstrategy,
1484 		    cs->sc_dkdev.dk_label, cs->sc_dkdev.dk_cpulabel);
1485 	if (errstring)
1486 		ccdmakedisklabel(cs);
1487 	else {
1488 		int i;
1489 		struct partition *pp;
1490 
1491 		/*
1492 		 * Sanity check whether the found disklabel is valid.
1493 		 *
1494 		 * This is necessary since total size of ccd may vary
1495 		 * when an interleave is changed even though exactly
1496 		 * same componets are used, and old disklabel may used
1497 		 * if that is found.
1498 		 */
1499 		if (lp->d_secperunit != cs->sc_size)
1500 			printf("WARNING: %s: "
1501 			    "total sector size in disklabel (%d) != "
1502 			    "the size of ccd (%lu)\n", cs->sc_xname,
1503 			    lp->d_secperunit, (u_long)cs->sc_size);
1504 		for (i = 0; i < lp->d_npartitions; i++) {
1505 			pp = &lp->d_partitions[i];
1506 			if (pp->p_offset + pp->p_size > cs->sc_size)
1507 				printf("WARNING: %s: end of partition `%c' "
1508 				    "exceeds the size of ccd (%lu)\n",
1509 				    cs->sc_xname, 'a' + i, (u_long)cs->sc_size);
1510 		}
1511 	}
1512 
1513 #ifdef DEBUG
1514 	/* It's actually extremely common to have unlabeled ccds. */
1515 	if (ccddebug & CCDB_LABEL)
1516 		if (errstring != NULL)
1517 			printf("%s: %s\n", cs->sc_xname, errstring);
1518 #endif
1519 
1520 	/* In-core label now valid. */
1521 	cs->sc_flags = (cs->sc_flags | CCDF_VLABEL) & ~CCDF_RLABEL;
1522 }
1523 
1524 /*
1525  * Take care of things one might want to take care of in the event
1526  * that a disklabel isn't present.
1527  */
1528 static void
1529 ccdmakedisklabel(struct ccd_softc *cs)
1530 {
1531 	struct disklabel *lp = cs->sc_dkdev.dk_label;
1532 
1533 	/*
1534 	 * For historical reasons, if there's no disklabel present
1535 	 * the raw partition must be marked FS_BSDFFS.
1536 	 */
1537 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1538 
1539 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1540 
1541 	lp->d_checksum = dkcksum(lp);
1542 }
1543 
#ifdef DEBUG
/*
 * Debug helper: print every entry of the interleave table starting at
 * `ii', stopping at the first entry whose ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;

	while (ii->ii_ndisk) {
		int k;

		printf(" itab[%d]: #dk %d sblk %" PRId64 " soff %" PRId64,
		    entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		for (k = 0; k < ii->ii_ndisk; k++) {
			printf(" %d", ii->ii_index[k]);
		}
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1559 
1560 MODULE(MODULE_CLASS_DRIVER, ccd, "dk_subr");
1561 
1562 static int
1563 ccd_modcmd(modcmd_t cmd, void *arg)
1564 {
1565 	int error = 0;
1566 #ifdef _MODULE
1567 	int bmajor = -1, cmajor = -1;
1568 #endif
1569 
1570 
1571 	switch (cmd) {
1572 	case MODULE_CMD_INIT:
1573 #ifdef _MODULE
1574 		ccdattach(4);
1575 
1576 		return devsw_attach("ccd", &ccd_bdevsw, &bmajor,
1577 		    &ccd_cdevsw, &cmajor);
1578 #endif
1579 		break;
1580 
1581 	case MODULE_CMD_FINI:
1582 #ifdef _MODULE
1583 		return devsw_detach(&ccd_bdevsw, &ccd_cdevsw);
1584 #endif
1585 		break;
1586 
1587 	case MODULE_CMD_STAT:
1588 		return ENOTTY;
1589 
1590 	default:
1591 		return ENOTTY;
1592 	}
1593 
1594 	return error;
1595 }
1596 
1597 static int
1598 ccd_units_sysctl(SYSCTLFN_ARGS)
1599 {
1600 	struct sysctlnode node;
1601 	struct ccd_softc *sc;
1602 	int error, i, nccd, *units;
1603 	size_t size;
1604 
1605 	nccd = 0;
1606 	mutex_enter(&ccd_lock);
1607 	LIST_FOREACH(sc, &ccds, sc_link)
1608 		nccd++;
1609 	mutex_exit(&ccd_lock);
1610 
1611 	if (nccd != 0) {
1612 		size = nccd * sizeof(*units);
1613 		units = kmem_zalloc(size, KM_SLEEP);
1614 		if (units == NULL)
1615 			return ENOMEM;
1616 
1617 		i = 0;
1618 		mutex_enter(&ccd_lock);
1619 		LIST_FOREACH(sc, &ccds, sc_link) {
1620 			if (i >= nccd)
1621 				break;
1622 			units[i] = sc->sc_unit;
1623 		}
1624 		mutex_exit(&ccd_lock);
1625 	} else {
1626 		units = NULL;
1627 		size = 0;
1628 	}
1629 
1630 	node = *rnode;
1631 	node.sysctl_data = units;
1632 	node.sysctl_size = size;
1633 
1634 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1635 	if (units)
1636 		kmem_free(units, size);
1637 	return error;
1638 }
1639 
1640 static int
1641 ccd_info_sysctl(SYSCTLFN_ARGS)
1642 {
1643 	struct sysctlnode node;
1644 	struct ccddiskinfo ccd;
1645 	struct ccd_softc *sc;
1646 	int unit;
1647 
1648 	if (newp == NULL || newlen != sizeof(int))
1649 		return EINVAL;
1650 
1651 	unit = *(const int *)newp;
1652 	newp = NULL;
1653 	newlen = 0;
1654 	ccd.ccd_ndisks = ~0;
1655 	mutex_enter(&ccd_lock);
1656 	LIST_FOREACH(sc, &ccds, sc_link) {
1657 		if (sc->sc_unit == unit) {
1658 			ccd.ccd_ileave = sc->sc_ileave;
1659 			ccd.ccd_size = sc->sc_size;
1660 			ccd.ccd_ndisks = sc->sc_nccdisks;
1661 			ccd.ccd_flags = sc->sc_flags;
1662 			break;
1663 		}
1664 	}
1665 	mutex_exit(&ccd_lock);
1666 
1667 	if (ccd.ccd_ndisks == ~0)
1668 		return ENOENT;
1669 
1670 	node = *rnode;
1671 	node.sysctl_data = &ccd;
1672 	node.sysctl_size = sizeof(ccd);
1673 
1674 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1675 }
1676 
1677 static int
1678 ccd_components_sysctl(SYSCTLFN_ARGS)
1679 {
1680 	struct sysctlnode node;
1681 	int error, unit;
1682 	size_t size;
1683 	char *names, *p, *ep;
1684 	struct ccd_softc *sc;
1685 
1686 	if (newp == NULL || newlen != sizeof(int))
1687 		return EINVAL;
1688 
1689 	size = 0;
1690 	unit = *(const int *)newp;
1691 	newp = NULL;
1692 	newlen = 0;
1693 	mutex_enter(&ccd_lock);
1694 	LIST_FOREACH(sc, &ccds, sc_link)
1695 		if (sc->sc_unit == unit) {
1696 			for (size_t i = 0; i < sc->sc_nccdisks; i++)
1697 				size += strlen(sc->sc_cinfo[i].ci_path) + 1;
1698 			break;
1699 		}
1700 	mutex_exit(&ccd_lock);
1701 
1702 	if (size == 0)
1703 		return ENOENT;
1704 	names = kmem_zalloc(size, KM_SLEEP);
1705 	if (names == NULL)
1706 		return ENOMEM;
1707 
1708 	p = names;
1709 	ep = names + size;
1710 	mutex_enter(&ccd_lock);
1711 	LIST_FOREACH(sc, &ccds, sc_link)
1712 		if (sc->sc_unit == unit) {
1713 			for (size_t i = 0; i < sc->sc_nccdisks; i++) {
1714 				char *d = sc->sc_cinfo[i].ci_path;
1715 				while (p < ep && (*p++ = *d++) != '\0')
1716 					continue;
1717 			}
1718 			break;
1719 		}
1720 	mutex_exit(&ccd_lock);
1721 
1722 	node = *rnode;
1723 	node.sysctl_data = names;
1724 	node.sysctl_size = ep - names;
1725 
1726 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1727 	kmem_free(names, size);
1728 	return error;
1729 }
1730 
1731 SYSCTL_SETUP(sysctl_kern_ccd_setup, "sysctl kern.ccd subtree setup")
1732 {
1733 	const struct sysctlnode *node = NULL;
1734 
1735 	sysctl_createv(clog, 0, NULL, &node,
1736 	    CTLFLAG_PERMANENT,
1737 	    CTLTYPE_NODE, "ccd",
1738 	    SYSCTL_DESCR("ConCatenated Disk state"),
1739 	    NULL, 0, NULL, 0,
1740 	    CTL_KERN, CTL_CREATE, CTL_EOL);
1741 
1742 	if (node == NULL)
1743 		return;
1744 
1745 	sysctl_createv(clog, 0, &node, NULL,
1746 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
1747 	    CTLTYPE_STRUCT, "units",
1748 	    SYSCTL_DESCR("List of ccd unit numbers"),
1749 	    ccd_units_sysctl, 0, NULL, 0,
1750 	    CTL_CREATE, CTL_EOL);
1751 	sysctl_createv(clog, 0, &node, NULL,
1752 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1753 	    CTLTYPE_STRUCT, "info",
1754 	    SYSCTL_DESCR("Information about a CCD unit"),
1755 	    ccd_info_sysctl, 0, NULL, 0,
1756 	    CTL_CREATE, CTL_EOL);
1757 	sysctl_createv(clog, 0, &node, NULL,
1758 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1759 	    CTLTYPE_STRUCT, "components",
1760 	    SYSCTL_DESCR("Information about CCD components"),
1761 	    ccd_components_sysctl, 0, NULL, 0,
1762 	    CTL_CREATE, CTL_EOL);
1763 }
1764