xref: /netbsd-src/sys/dev/ccd.c (revision 6cf6fe02a981b55727c49c3d37b0d8191a98c0ee)
1 /*	$NetBSD: ccd.c,v 1.152 2014/08/16 19:27:27 sborrill Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 1999, 2007, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe, and by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * "Concatenated" disk driver.
72  *
73  * Notes on concurrency:
74  *
75  * => sc_dvlock serializes access to the device nodes, excluding block I/O.
76  *
77  * => sc_iolock serializes access to (sc_flags & CCDF_INITED), disk stats,
78  *    sc_stop, sc_bufq and b_resid from master buffers.
79  *
80  * => a combination of CCDF_INITED, sc_inflight, and sc_iolock is used to
81  *    serialize I/O and configuration changes.
82  *
83  * => the in-core disk label does not change while the device is open.
84  *
85  * On memory consumption: ccd fans out I/O requests and so needs to
86  * allocate memory.  If the system is desperately low on memory, we
87  * single thread I/O.
88  */
89 
90 #include <sys/cdefs.h>
91 __KERNEL_RCSID(0, "$NetBSD: ccd.c,v 1.152 2014/08/16 19:27:27 sborrill Exp $");
92 
93 #if defined(_KERNEL_OPT)
94 #include "opt_compat_netbsd.h"
95 #endif
96 
97 #include <sys/param.h>
98 #include <sys/systm.h>
99 #include <sys/kernel.h>
100 #include <sys/proc.h>
101 #include <sys/errno.h>
102 #include <sys/buf.h>
103 #include <sys/kmem.h>
104 #include <sys/pool.h>
105 #include <sys/module.h>
106 #include <sys/namei.h>
107 #include <sys/stat.h>
108 #include <sys/ioctl.h>
109 #include <sys/disklabel.h>
110 #include <sys/device.h>
111 #include <sys/disk.h>
112 #include <sys/syslog.h>
113 #include <sys/fcntl.h>
114 #include <sys/vnode.h>
115 #include <sys/conf.h>
116 #include <sys/mutex.h>
117 #include <sys/queue.h>
118 #include <sys/kauth.h>
119 #include <sys/kthread.h>
120 #include <sys/bufq.h>
121 #include <sys/sysctl.h>
122 
123 #include <uvm/uvm_extern.h>
124 
125 #include <dev/ccdvar.h>
126 #include <dev/dkvar.h>
127 
128 #include <miscfs/specfs/specdev.h> /* for v_rdev */
129 
/* CCDDEBUG is a driver-local switch that enables the DEBUG code below. */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/* Bits tested against ccddebug to select classes of debug output. */
#define CCDB_FOLLOW	(1 << 0)	/* trace entry into driver routines */
#define CCDB_INIT	(1 << 1)	/* configuration / interleave setup */
#define CCDB_IO		(1 << 2)	/* per-component buffer details */
#define CCDB_LABEL	(1 << 3)
#define CCDB_VNODE	(1 << 4)
int ccddebug = 0;
#endif
142 
143 #define	ccdunit(x)	DISKUNIT(x)
144 
145 struct ccdbuf {
146 	struct buf	cb_buf;		/* new I/O buf */
147 	struct buf	*cb_obp;	/* ptr. to original I/O buf */
148 	struct ccd_softc *cb_sc;	/* pointer to ccd softc */
149 	int		cb_comp;	/* target component */
150 	SIMPLEQ_ENTRY(ccdbuf) cb_q;	/* fifo of component buffers */
151 };
152 
153 /* component buffer pool */
154 static pool_cache_t ccd_cache;
155 
156 #define	CCD_GETBUF()		pool_cache_get(ccd_cache, PR_WAITOK)
157 #define	CCD_PUTBUF(cbp)		pool_cache_put(ccd_cache, cbp)
158 
159 #define CCDLABELDEV(dev)	\
160 	(MAKEDISKDEV(major((dev)), ccdunit((dev)), RAW_PART))
161 
162 /* called by main() at boot time */
163 void	ccdattach(int);
164 
165 /* called by biodone() at interrupt time */
166 static void	ccdiodone(struct buf *);
167 
168 static void	ccdinterleave(struct ccd_softc *);
169 static int	ccdinit(struct ccd_softc *, char **, struct vnode **,
170 		    struct lwp *);
171 static struct ccdbuf *ccdbuffer(struct ccd_softc *, struct buf *,
172 		    daddr_t, void *, long);
173 static void	ccdgetdefaultlabel(struct ccd_softc *, struct disklabel *);
174 static void	ccdgetdisklabel(dev_t);
175 static void	ccdmakedisklabel(struct ccd_softc *);
176 static void	ccdstart(struct ccd_softc *);
177 static void	ccdthread(void *);
178 
179 static dev_type_open(ccdopen);
180 static dev_type_close(ccdclose);
181 static dev_type_read(ccdread);
182 static dev_type_write(ccdwrite);
183 static dev_type_ioctl(ccdioctl);
184 static dev_type_strategy(ccdstrategy);
185 static dev_type_size(ccdsize);
186 
187 const struct bdevsw ccd_bdevsw = {
188 	.d_open = ccdopen,
189 	.d_close = ccdclose,
190 	.d_strategy = ccdstrategy,
191 	.d_ioctl = ccdioctl,
192 	.d_dump = nodump,
193 	.d_psize = ccdsize,
194 	.d_discard = nodiscard,
195 	.d_flag = D_DISK | D_MPSAFE
196 };
197 
198 const struct cdevsw ccd_cdevsw = {
199 	.d_open = ccdopen,
200 	.d_close = ccdclose,
201 	.d_read = ccdread,
202 	.d_write = ccdwrite,
203 	.d_ioctl = ccdioctl,
204 	.d_stop = nostop,
205 	.d_tty = notty,
206 	.d_poll = nopoll,
207 	.d_mmap = nommap,
208 	.d_kqfilter = nokqfilter,
209 	.d_discard = nodiscard,
210 	.d_flag = D_DISK | D_MPSAFE
211 };
212 
213 #ifdef DEBUG
214 static	void printiinfo(struct ccdiinfo *);
215 #endif
216 
217 static LIST_HEAD(, ccd_softc) ccds = LIST_HEAD_INITIALIZER(ccds);
218 static kmutex_t ccd_lock;
219 
220 static struct ccd_softc *
221 ccdcreate(int unit) {
222 	struct ccd_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
223 	if (sc == NULL) {
224 #ifdef DIAGNOSTIC
225 		printf("%s: out of memory\n", __func__);
226 #endif
227 		return NULL;
228 	}
229 	/* Initialize per-softc structures. */
230 	snprintf(sc->sc_xname, sizeof(sc->sc_xname), "ccd%d", unit);
231 	mutex_init(&sc->sc_dvlock, MUTEX_DEFAULT, IPL_NONE);
232 	sc->sc_iolock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
233 	cv_init(&sc->sc_stop, "ccdstop");
234 	cv_init(&sc->sc_push, "ccdthr");
235 	disk_init(&sc->sc_dkdev, sc->sc_xname, NULL); /* XXX */
236 	return sc;
237 }
238 
239 static void
240 ccddestroy(struct ccd_softc *sc) {
241 	mutex_obj_free(sc->sc_iolock);
242 	mutex_exit(&sc->sc_dvlock);
243 	mutex_destroy(&sc->sc_dvlock);
244 	cv_destroy(&sc->sc_stop);
245 	cv_destroy(&sc->sc_push);
246 	disk_destroy(&sc->sc_dkdev);
247 	kmem_free(sc, sizeof(*sc));
248 }
249 
250 static struct ccd_softc *
251 ccdget(int unit) {
252 	struct ccd_softc *sc;
253 	if (unit < 0) {
254 #ifdef DIAGNOSTIC
255 		panic("%s: unit %d!", __func__, unit);
256 #endif
257 		return NULL;
258 	}
259 	mutex_enter(&ccd_lock);
260 	LIST_FOREACH(sc, &ccds, sc_link) {
261 		if (sc->sc_unit == unit) {
262 			mutex_exit(&ccd_lock);
263 			return sc;
264 		}
265 	}
266 	mutex_exit(&ccd_lock);
267 	if ((sc = ccdcreate(unit)) == NULL)
268 		return NULL;
269 	mutex_enter(&ccd_lock);
270 	LIST_INSERT_HEAD(&ccds, sc, sc_link);
271 	mutex_exit(&ccd_lock);
272 	return sc;
273 }
274 
275 static void
276 ccdput(struct ccd_softc *sc) {
277 	mutex_enter(&ccd_lock);
278 	LIST_REMOVE(sc, sc_link);
279 	mutex_exit(&ccd_lock);
280 	ccddestroy(sc);
281 }
282 
283 /*
284  * Called by main() during pseudo-device attachment.  All we need
285  * to do is allocate enough space for devices to be configured later.
286  */
287 void
288 ccdattach(int num)
289 {
290 	mutex_init(&ccd_lock, MUTEX_DEFAULT, IPL_NONE);
291 
292 	/* Initialize the component buffer pool. */
293 	ccd_cache = pool_cache_init(sizeof(struct ccdbuf), 0,
294 	    0, 0, "ccdbuf", NULL, IPL_BIO, NULL, NULL, NULL);
295 }
296 
297 static int
298 ccdinit(struct ccd_softc *cs, char **cpaths, struct vnode **vpp,
299     struct lwp *l)
300 {
301 	struct ccdcinfo *ci = NULL;
302 	int ix;
303 	struct ccdgeom *ccg = &cs->sc_geom;
304 	char *tmppath;
305 	int error, path_alloced;
306 	uint64_t psize, minsize;
307 	unsigned secsize, maxsecsize;
308 
309 #ifdef DEBUG
310 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
311 		printf("%s: ccdinit\n", cs->sc_xname);
312 #endif
313 
314 	/* Allocate space for the component info. */
315 	cs->sc_cinfo = kmem_alloc(cs->sc_nccdisks * sizeof(*cs->sc_cinfo),
316 	    KM_SLEEP);
317 	tmppath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
318 
319 	cs->sc_size = 0;
320 
321 	/*
322 	 * Verify that each component piece exists and record
323 	 * relevant information about it.
324 	 */
325 	maxsecsize = 0;
326 	minsize = 0;
327 	for (ix = 0, path_alloced = 0; ix < cs->sc_nccdisks; ix++) {
328 		ci = &cs->sc_cinfo[ix];
329 		ci->ci_vp = vpp[ix];
330 
331 		/*
332 		 * Copy in the pathname of the component.
333 		 */
334 		memset(tmppath, 0, MAXPATHLEN);	/* sanity */
335 		error = copyinstr(cpaths[ix], tmppath,
336 		    MAXPATHLEN, &ci->ci_pathlen);
337 		if (ci->ci_pathlen == 0)
338 			error = EINVAL;
339 		if (error) {
340 #ifdef DEBUG
341 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
342 				printf("%s: can't copy path, error = %d\n",
343 				    cs->sc_xname, error);
344 #endif
345 			goto out;
346 		}
347 		ci->ci_path = kmem_alloc(ci->ci_pathlen, KM_SLEEP);
348 		memcpy(ci->ci_path, tmppath, ci->ci_pathlen);
349 		path_alloced++;
350 
351 		/*
352 		 * XXX: Cache the component's dev_t.
353 		 */
354 		ci->ci_dev = vpp[ix]->v_rdev;
355 
356 		/*
357 		 * Get partition information for the component.
358 		 */
359 		error = getdisksize(vpp[ix], &psize, &secsize);
360 		if (error) {
361 #ifdef DEBUG
362 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
363 				 printf("%s: %s: disksize failed, error = %d\n",
364 				     cs->sc_xname, ci->ci_path, error);
365 #endif
366 			goto out;
367 		}
368 
369 		/*
370 		 * Calculate the size, truncating to an interleave
371 		 * boundary if necessary.
372 		 */
373 		maxsecsize = secsize > maxsecsize ? secsize : maxsecsize;
374 		if (cs->sc_ileave > 1)
375 			psize -= psize % cs->sc_ileave;
376 
377 		if (psize == 0) {
378 #ifdef DEBUG
379 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
380 				printf("%s: %s: size == 0\n",
381 				    cs->sc_xname, ci->ci_path);
382 #endif
383 			error = ENODEV;
384 			goto out;
385 		}
386 
387 		if (minsize == 0 || psize < minsize)
388 			minsize = psize;
389 		ci->ci_size = psize;
390 		cs->sc_size += psize;
391 	}
392 
393 	/*
394 	 * Don't allow the interleave to be smaller than
395 	 * the biggest component sector.
396 	 */
397 	if ((cs->sc_ileave > 0) &&
398 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
399 #ifdef DEBUG
400 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
401 			printf("%s: interleave must be at least %d\n",
402 			    cs->sc_xname, (maxsecsize / DEV_BSIZE));
403 #endif
404 		error = EINVAL;
405 		goto out;
406 	}
407 
408 	/*
409 	 * If uniform interleave is desired set all sizes to that of
410 	 * the smallest component.
411 	 */
412 	if (cs->sc_flags & CCDF_UNIFORM) {
413 		for (ci = cs->sc_cinfo;
414 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
415 			ci->ci_size = minsize;
416 
417 		cs->sc_size = cs->sc_nccdisks * minsize;
418 	}
419 
420 	/*
421 	 * Construct the interleave table.
422 	 */
423 	ccdinterleave(cs);
424 
425 	/*
426 	 * Create pseudo-geometry based on 1MB cylinders.  It's
427 	 * pretty close.
428 	 */
429 	ccg->ccg_secsize = DEV_BSIZE;
430 	ccg->ccg_ntracks = 1;
431 	ccg->ccg_nsectors = 1024 * (1024 / ccg->ccg_secsize);
432 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
433 
434 	if (cs->sc_ileave > 0)
435 	        aprint_normal("%s: Interleaving %d component%s "
436 	            "(%d block interleave)\n", cs->sc_xname,
437         	    cs->sc_nccdisks, (cs->sc_nccdisks != 0 ? "s" : ""),
438         	    cs->sc_ileave);
439 	else
440 	        aprint_normal("%s: Concatenating %d component%s\n",
441 	            cs->sc_xname,
442         	    cs->sc_nccdisks, (cs->sc_nccdisks != 0 ? "s" : ""));
443 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
444 		ci = &cs->sc_cinfo[ix];
445 		aprint_normal("%s: %s (%ju blocks)\n", cs->sc_xname,
446 		    ci->ci_path, (uintmax_t)ci->ci_size);
447 	}
448 	aprint_normal("%s: total %ju blocks\n", cs->sc_xname, cs->sc_size);
449 
450 	/*
451 	 * Create thread to handle deferred I/O.
452 	 */
453 	cs->sc_zap = false;
454 	error = kthread_create(PRI_BIO, KTHREAD_MPSAFE, NULL, ccdthread,
455 	    cs, &cs->sc_thread, "%s", cs->sc_xname);
456 	if (error) {
457 		printf("ccdinit: can't create thread: %d\n", error);
458 		goto out;
459 	}
460 
461 	/*
462 	 * Only now that everything is set up can we enable the device.
463 	 */
464 	mutex_enter(cs->sc_iolock);
465 	cs->sc_flags |= CCDF_INITED;
466 	mutex_exit(cs->sc_iolock);
467 	kmem_free(tmppath, MAXPATHLEN);
468 	return (0);
469 
470  out:
471 	for (ix = 0; ix < path_alloced; ix++) {
472 		kmem_free(cs->sc_cinfo[ix].ci_path,
473 		    cs->sc_cinfo[ix].ci_pathlen);
474 	}
475 	kmem_free(cs->sc_cinfo, cs->sc_nccdisks * sizeof(struct ccdcinfo));
476 	kmem_free(tmppath, MAXPATHLEN);
477 	return (error);
478 }
479 
480 static void
481 ccdinterleave(struct ccd_softc *cs)
482 {
483 	struct ccdcinfo *ci, *smallci;
484 	struct ccdiinfo *ii;
485 	daddr_t bn, lbn;
486 	int ix;
487 	u_long size;
488 
489 #ifdef DEBUG
490 	if (ccddebug & CCDB_INIT)
491 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
492 #endif
493 	/*
494 	 * Allocate an interleave table.
495 	 * Chances are this is too big, but we don't care.
496 	 */
497 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
498 	cs->sc_itable = kmem_zalloc(size, KM_SLEEP);
499 
500 	/*
501 	 * Trivial case: no interleave (actually interleave of disk size).
502 	 * Each table entry represents a single component in its entirety.
503 	 */
504 	if (cs->sc_ileave == 0) {
505 		bn = 0;
506 		ii = cs->sc_itable;
507 
508 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
509 			/* Allocate space for ii_index. */
510 			ii->ii_indexsz = sizeof(int);
511 			ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP);
512 			ii->ii_ndisk = 1;
513 			ii->ii_startblk = bn;
514 			ii->ii_startoff = 0;
515 			ii->ii_index[0] = ix;
516 			bn += cs->sc_cinfo[ix].ci_size;
517 			ii++;
518 		}
519 		ii->ii_ndisk = 0;
520 #ifdef DEBUG
521 		if (ccddebug & CCDB_INIT)
522 			printiinfo(cs->sc_itable);
523 #endif
524 		return;
525 	}
526 
527 	/*
528 	 * The following isn't fast or pretty; it doesn't have to be.
529 	 */
530 	size = 0;
531 	bn = lbn = 0;
532 	for (ii = cs->sc_itable; ; ii++) {
533 		/* Allocate space for ii_index. */
534 		ii->ii_indexsz = sizeof(int) * cs->sc_nccdisks;
535 		ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP);
536 
537 		/*
538 		 * Locate the smallest of the remaining components
539 		 */
540 		smallci = NULL;
541 		for (ci = cs->sc_cinfo;
542 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
543 			if (ci->ci_size > size &&
544 			    (smallci == NULL ||
545 			     ci->ci_size < smallci->ci_size))
546 				smallci = ci;
547 
548 		/*
549 		 * Nobody left, all done
550 		 */
551 		if (smallci == NULL) {
552 			ii->ii_ndisk = 0;
553 			break;
554 		}
555 
556 		/*
557 		 * Record starting logical block and component offset
558 		 */
559 		ii->ii_startblk = bn / cs->sc_ileave;
560 		ii->ii_startoff = lbn;
561 
562 		/*
563 		 * Determine how many disks take part in this interleave
564 		 * and record their indices.
565 		 */
566 		ix = 0;
567 		for (ci = cs->sc_cinfo;
568 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++)
569 			if (ci->ci_size >= smallci->ci_size)
570 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
571 		ii->ii_ndisk = ix;
572 		bn += ix * (smallci->ci_size - size);
573 		lbn = smallci->ci_size / cs->sc_ileave;
574 		size = smallci->ci_size;
575 	}
576 #ifdef DEBUG
577 	if (ccddebug & CCDB_INIT)
578 		printiinfo(cs->sc_itable);
579 #endif
580 }
581 
582 /* ARGSUSED */
583 static int
584 ccdopen(dev_t dev, int flags, int fmt, struct lwp *l)
585 {
586 	int unit = ccdunit(dev);
587 	struct ccd_softc *cs;
588 	struct disklabel *lp;
589 	int error = 0, part, pmask;
590 
591 #ifdef DEBUG
592 	if (ccddebug & CCDB_FOLLOW)
593 		printf("ccdopen(0x%"PRIx64", 0x%x)\n", dev, flags);
594 #endif
595 	if ((cs = ccdget(unit)) == NULL)
596 		return ENXIO;
597 
598 	mutex_enter(&cs->sc_dvlock);
599 
600 	lp = cs->sc_dkdev.dk_label;
601 
602 	part = DISKPART(dev);
603 	pmask = (1 << part);
604 
605 	/*
606 	 * If we're initialized, check to see if there are any other
607 	 * open partitions.  If not, then it's safe to update
608 	 * the in-core disklabel.  Only read the disklabel if it is
609 	 * not already valid.
610 	 */
611 	if ((cs->sc_flags & (CCDF_INITED|CCDF_VLABEL)) == CCDF_INITED &&
612 	    cs->sc_dkdev.dk_openmask == 0)
613 		ccdgetdisklabel(dev);
614 
615 	/* Check that the partition exists. */
616 	if (part != RAW_PART) {
617 		if (((cs->sc_flags & CCDF_INITED) == 0) ||
618 		    ((part >= lp->d_npartitions) ||
619 		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
620 			error = ENXIO;
621 			goto done;
622 		}
623 	}
624 
625 	/* Prevent our unit from being unconfigured while open. */
626 	switch (fmt) {
627 	case S_IFCHR:
628 		cs->sc_dkdev.dk_copenmask |= pmask;
629 		break;
630 
631 	case S_IFBLK:
632 		cs->sc_dkdev.dk_bopenmask |= pmask;
633 		break;
634 	}
635 	cs->sc_dkdev.dk_openmask =
636 	    cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask;
637 
638  done:
639 	mutex_exit(&cs->sc_dvlock);
640 	return (error);
641 }
642 
643 /* ARGSUSED */
644 static int
645 ccdclose(dev_t dev, int flags, int fmt, struct lwp *l)
646 {
647 	int unit = ccdunit(dev);
648 	struct ccd_softc *cs;
649 	int part;
650 
651 #ifdef DEBUG
652 	if (ccddebug & CCDB_FOLLOW)
653 		printf("ccdclose(0x%"PRIx64", 0x%x)\n", dev, flags);
654 #endif
655 
656 	if ((cs = ccdget(unit)) == NULL)
657 		return ENXIO;
658 
659 	mutex_enter(&cs->sc_dvlock);
660 
661 	part = DISKPART(dev);
662 
663 	/* ...that much closer to allowing unconfiguration... */
664 	switch (fmt) {
665 	case S_IFCHR:
666 		cs->sc_dkdev.dk_copenmask &= ~(1 << part);
667 		break;
668 
669 	case S_IFBLK:
670 		cs->sc_dkdev.dk_bopenmask &= ~(1 << part);
671 		break;
672 	}
673 	cs->sc_dkdev.dk_openmask =
674 	    cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask;
675 
676 	if (cs->sc_dkdev.dk_openmask == 0) {
677 		if ((cs->sc_flags & CCDF_KLABEL) == 0)
678 			cs->sc_flags &= ~CCDF_VLABEL;
679 	}
680 
681 	mutex_exit(&cs->sc_dvlock);
682 	return (0);
683 }
684 
685 static bool
686 ccdbackoff(struct ccd_softc *cs)
687 {
688 
689 	/* XXX Arbitrary, should be a uvm call. */
690 	return uvmexp.free < (uvmexp.freemin >> 1) &&
691 	    disk_isbusy(&cs->sc_dkdev);
692 }
693 
694 static void
695 ccdthread(void *cookie)
696 {
697 	struct ccd_softc *cs;
698 
699 	cs = cookie;
700 
701 #ifdef DEBUG
702  	if (ccddebug & CCDB_FOLLOW)
703  		printf("ccdthread: hello\n");
704 #endif
705 
706 	mutex_enter(cs->sc_iolock);
707 	while (__predict_true(!cs->sc_zap)) {
708 		if (bufq_peek(cs->sc_bufq) == NULL) {
709 			/* Nothing to do. */
710 			cv_wait(&cs->sc_push, cs->sc_iolock);
711 			continue;
712 		}
713 		if (ccdbackoff(cs)) {
714 			/* Wait for memory to become available. */
715 			(void)cv_timedwait(&cs->sc_push, cs->sc_iolock, 1);
716 			continue;
717 		}
718 #ifdef DEBUG
719  		if (ccddebug & CCDB_FOLLOW)
720  			printf("ccdthread: dispatching I/O\n");
721 #endif
722 		ccdstart(cs);
723 		mutex_enter(cs->sc_iolock);
724 	}
725 	cs->sc_thread = NULL;
726 	mutex_exit(cs->sc_iolock);
727 #ifdef DEBUG
728  	if (ccddebug & CCDB_FOLLOW)
729  		printf("ccdthread: goodbye\n");
730 #endif
731 	kthread_exit(0);
732 }
733 
734 static void
735 ccdstrategy(struct buf *bp)
736 {
737 	int unit = ccdunit(bp->b_dev);
738 	struct ccd_softc *cs;
739 	if ((cs = ccdget(unit)) == NULL)
740 		return;
741 
742 	/* Must be open or reading label. */
743 	KASSERT(cs->sc_dkdev.dk_openmask != 0 ||
744 	    (cs->sc_flags & CCDF_RLABEL) != 0);
745 
746 	mutex_enter(cs->sc_iolock);
747 	/* Synchronize with device init/uninit. */
748 	if (__predict_false((cs->sc_flags & CCDF_INITED) == 0)) {
749 		mutex_exit(cs->sc_iolock);
750 #ifdef DEBUG
751  		if (ccddebug & CCDB_FOLLOW)
752  			printf("ccdstrategy: unit %d: not inited\n", unit);
753 #endif
754  		bp->b_error = ENXIO;
755  		bp->b_resid = bp->b_bcount;
756  		biodone(bp);
757 		return;
758 	}
759 
760 	/* Defer to thread if system is low on memory. */
761 	bufq_put(cs->sc_bufq, bp);
762 	if (__predict_false(ccdbackoff(cs))) {
763 		mutex_exit(cs->sc_iolock);
764 #ifdef DEBUG
765  		if (ccddebug & CCDB_FOLLOW)
766  			printf("ccdstrategy: holding off on I/O\n");
767 #endif
768 		return;
769 	}
770 	ccdstart(cs);
771 }
772 
773 static void
774 ccdstart(struct ccd_softc *cs)
775 {
776 	daddr_t blkno;
777 	int wlabel;
778 	struct disklabel *lp;
779 	long bcount, rcount;
780 	struct ccdbuf *cbp;
781 	char *addr;
782 	daddr_t bn;
783 	vnode_t *vp;
784 	buf_t *bp;
785 
786 	KASSERT(mutex_owned(cs->sc_iolock));
787 
788 	disk_busy(&cs->sc_dkdev);
789 	bp = bufq_get(cs->sc_bufq);
790 	KASSERT(bp != NULL);
791 
792 #ifdef DEBUG
793 	if (ccddebug & CCDB_FOLLOW)
794 		printf("ccdstart(%s, %p)\n", cs->sc_xname, bp);
795 #endif
796 
797 	/* If it's a nil transfer, wake up the top half now. */
798 	if (bp->b_bcount == 0)
799 		goto done;
800 
801 	lp = cs->sc_dkdev.dk_label;
802 
803 	/*
804 	 * Do bounds checking and adjust transfer.  If there's an
805 	 * error, the bounds check will flag that for us.  Convert
806 	 * the partition relative block number to an absolute.
807 	 */
808 	blkno = bp->b_blkno;
809 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
810 	if (DISKPART(bp->b_dev) != RAW_PART) {
811 		if (bounds_check_with_label(&cs->sc_dkdev, bp, wlabel) <= 0)
812 			goto done;
813 		blkno += lp->d_partitions[DISKPART(bp->b_dev)].p_offset;
814 	}
815 	mutex_exit(cs->sc_iolock);
816 	bp->b_rawblkno = blkno;
817 
818 	/* Allocate the component buffers and start I/O! */
819 	bp->b_resid = bp->b_bcount;
820 	bn = bp->b_rawblkno;
821 	addr = bp->b_data;
822 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
823 		cbp = ccdbuffer(cs, bp, bn, addr, bcount);
824 		rcount = cbp->cb_buf.b_bcount;
825 		bn += btodb(rcount);
826 		addr += rcount;
827 		vp = cbp->cb_buf.b_vp;
828 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
829 			mutex_enter(vp->v_interlock);
830 			vp->v_numoutput++;
831 			mutex_exit(vp->v_interlock);
832 		}
833 		(void)VOP_STRATEGY(vp, &cbp->cb_buf);
834 	}
835 	return;
836 
837  done:
838 	disk_unbusy(&cs->sc_dkdev, 0, 0);
839 	cv_broadcast(&cs->sc_stop);
840 	cv_broadcast(&cs->sc_push);
841 	mutex_exit(cs->sc_iolock);
842 	bp->b_resid = bp->b_bcount;
843 	biodone(bp);
844 }
845 
846 /*
847  * Build a component buffer header.
848  */
849 static struct ccdbuf *
850 ccdbuffer(struct ccd_softc *cs, struct buf *bp, daddr_t bn, void *addr,
851     long bcount)
852 {
853 	struct ccdcinfo *ci;
854 	struct ccdbuf *cbp;
855 	daddr_t cbn, cboff;
856 	u_int64_t cbc;
857 	int ccdisk;
858 
859 #ifdef DEBUG
860 	if (ccddebug & CCDB_IO)
861 		printf("ccdbuffer(%p, %p, %" PRId64 ", %p, %ld)\n",
862 		       cs, bp, bn, addr, bcount);
863 #endif
864 	/*
865 	 * Determine which component bn falls in.
866 	 */
867 	cbn = bn;
868 	cboff = 0;
869 
870 	/*
871 	 * Serially concatenated
872 	 */
873 	if (cs->sc_ileave == 0) {
874 		daddr_t sblk;
875 
876 		sblk = 0;
877 		for (ccdisk = 0, ci = &cs->sc_cinfo[ccdisk];
878 		    cbn >= sblk + ci->ci_size;
879 		    ccdisk++, ci = &cs->sc_cinfo[ccdisk])
880 			sblk += ci->ci_size;
881 		cbn -= sblk;
882 	}
883 	/*
884 	 * Interleaved
885 	 */
886 	else {
887 		struct ccdiinfo *ii;
888 		int off;
889 
890 		cboff = cbn % cs->sc_ileave;
891 		cbn /= cs->sc_ileave;
892 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++)
893 			if (ii->ii_startblk > cbn)
894 				break;
895 		ii--;
896 		off = cbn - ii->ii_startblk;
897 		if (ii->ii_ndisk == 1) {
898 			ccdisk = ii->ii_index[0];
899 			cbn = ii->ii_startoff + off;
900 		} else {
901 			ccdisk = ii->ii_index[off % ii->ii_ndisk];
902 			cbn = ii->ii_startoff + off / ii->ii_ndisk;
903 		}
904 		cbn *= cs->sc_ileave;
905 		ci = &cs->sc_cinfo[ccdisk];
906 	}
907 
908 	/*
909 	 * Fill in the component buf structure.
910 	 */
911 	cbp = CCD_GETBUF();
912 	KASSERT(cbp != NULL);
913 	buf_init(&cbp->cb_buf);
914 	cbp->cb_buf.b_flags = bp->b_flags;
915 	cbp->cb_buf.b_oflags = bp->b_oflags;
916 	cbp->cb_buf.b_cflags = bp->b_cflags;
917 	cbp->cb_buf.b_iodone = ccdiodone;
918 	cbp->cb_buf.b_proc = bp->b_proc;
919 	cbp->cb_buf.b_dev = ci->ci_dev;
920 	cbp->cb_buf.b_blkno = cbn + cboff;
921 	cbp->cb_buf.b_data = addr;
922 	cbp->cb_buf.b_vp = ci->ci_vp;
923 	cbp->cb_buf.b_objlock = ci->ci_vp->v_interlock;
924 	if (cs->sc_ileave == 0)
925 		cbc = dbtob((u_int64_t)(ci->ci_size - cbn));
926 	else
927 		cbc = dbtob((u_int64_t)(cs->sc_ileave - cboff));
928 	cbp->cb_buf.b_bcount = cbc < bcount ? cbc : bcount;
929 
930 	/*
931 	 * context for ccdiodone
932 	 */
933 	cbp->cb_obp = bp;
934 	cbp->cb_sc = cs;
935 	cbp->cb_comp = ccdisk;
936 
937 	BIO_COPYPRIO(&cbp->cb_buf, bp);
938 
939 #ifdef DEBUG
940 	if (ccddebug & CCDB_IO)
941 		printf(" dev 0x%"PRIx64"(u%lu): cbp %p bn %" PRId64 " addr %p"
942 		       " bcnt %d\n",
943 		    ci->ci_dev, (unsigned long) (ci-cs->sc_cinfo), cbp,
944 		    cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
945 		    cbp->cb_buf.b_bcount);
946 #endif
947 
948 	return (cbp);
949 }
950 
951 /*
952  * Called at interrupt time.
953  * Mark the component as done and if all components are done,
954  * take a ccd interrupt.
955  */
956 static void
957 ccdiodone(struct buf *vbp)
958 {
959 	struct ccdbuf *cbp = (struct ccdbuf *) vbp;
960 	struct buf *bp = cbp->cb_obp;
961 	struct ccd_softc *cs = cbp->cb_sc;
962 	int count;
963 
964 #ifdef DEBUG
965 	if (ccddebug & CCDB_FOLLOW)
966 		printf("ccdiodone(%p)\n", cbp);
967 	if (ccddebug & CCDB_IO) {
968 		printf("ccdiodone: bp %p bcount %d resid %d\n",
969 		       bp, bp->b_bcount, bp->b_resid);
970 		printf(" dev 0x%"PRIx64"(u%d), cbp %p bn %" PRId64 " addr %p"
971 		       " bcnt %d\n",
972 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
973 		       cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
974 		       cbp->cb_buf.b_bcount);
975 	}
976 #endif
977 
978 	if (cbp->cb_buf.b_error != 0) {
979 		bp->b_error = cbp->cb_buf.b_error;
980 		printf("%s: error %d on component %d\n",
981 		       cs->sc_xname, bp->b_error, cbp->cb_comp);
982 	}
983 	count = cbp->cb_buf.b_bcount;
984 	buf_destroy(&cbp->cb_buf);
985 	CCD_PUTBUF(cbp);
986 
987 	/*
988 	 * If all done, "interrupt".
989 	 */
990 	mutex_enter(cs->sc_iolock);
991 	bp->b_resid -= count;
992 	if (bp->b_resid < 0)
993 		panic("ccdiodone: count");
994 	if (bp->b_resid == 0) {
995 		/*
996 		 * Request is done for better or worse, wakeup the top half.
997 		 */
998 		if (bp->b_error != 0)
999 			bp->b_resid = bp->b_bcount;
1000 		disk_unbusy(&cs->sc_dkdev, (bp->b_bcount - bp->b_resid),
1001 		    (bp->b_flags & B_READ));
1002 		if (!disk_isbusy(&cs->sc_dkdev)) {
1003 			if (bufq_peek(cs->sc_bufq) != NULL) {
1004 				cv_broadcast(&cs->sc_push);
1005 			}
1006 			cv_broadcast(&cs->sc_stop);
1007 		}
1008 		mutex_exit(cs->sc_iolock);
1009 		biodone(bp);
1010 	} else
1011 		mutex_exit(cs->sc_iolock);
1012 }
1013 
1014 /* ARGSUSED */
1015 static int
1016 ccdread(dev_t dev, struct uio *uio, int flags)
1017 {
1018 	int unit = ccdunit(dev);
1019 	struct ccd_softc *cs;
1020 
1021 #ifdef DEBUG
1022 	if (ccddebug & CCDB_FOLLOW)
1023 		printf("ccdread(0x%"PRIx64", %p)\n", dev, uio);
1024 #endif
1025 	if ((cs = ccdget(unit)) == NULL)
1026 		return 0;
1027 
1028 	/* Unlocked advisory check, ccdstrategy check is synchronous. */
1029 	if ((cs->sc_flags & CCDF_INITED) == 0)
1030 		return (ENXIO);
1031 
1032 	return (physio(ccdstrategy, NULL, dev, B_READ, minphys, uio));
1033 }
1034 
1035 /* ARGSUSED */
1036 static int
1037 ccdwrite(dev_t dev, struct uio *uio, int flags)
1038 {
1039 	int unit = ccdunit(dev);
1040 	struct ccd_softc *cs;
1041 
1042 #ifdef DEBUG
1043 	if (ccddebug & CCDB_FOLLOW)
1044 		printf("ccdwrite(0x%"PRIx64", %p)\n", dev, uio);
1045 #endif
1046 	if ((cs = ccdget(unit)) == NULL)
1047 		return ENOENT;
1048 
1049 	/* Unlocked advisory check, ccdstrategy check is synchronous. */
1050 	if ((cs->sc_flags & CCDF_INITED) == 0)
1051 		return (ENXIO);
1052 
1053 	return (physio(ccdstrategy, NULL, dev, B_WRITE, minphys, uio));
1054 }
1055 
1056 static int
1057 ccdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1058 {
1059 	int unit = ccdunit(dev);
1060 	int i, j, lookedup = 0, error = 0;
1061 	int part, pmask;
1062 	struct ccd_softc *cs;
1063 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1064 	kauth_cred_t uc;
1065 	char **cpp;
1066 	struct pathbuf *pb;
1067 	struct vnode **vpp;
1068 #ifdef __HAVE_OLD_DISKLABEL
1069 	struct disklabel newlabel;
1070 #endif
1071 
1072 	if ((cs = ccdget(unit)) == NULL)
1073 		return ENOENT;
1074 	uc = kauth_cred_get();
1075 
1076 /*
1077  * Compat code must not be called if on a platform where
1078  * sizeof (size_t) == sizeof (uint64_t) as CCDIOCSET will
1079  * be the same as CCDIOCSET_60
1080  */
1081 #if defined(COMPAT_60) && !defined(_LP64)
1082 	switch (cmd) {
1083 	case CCDIOCSET_60: {
1084 		struct ccd_ioctl ccionew;
1085        		struct ccd_ioctl_60 *ccio60 =
1086        		    (struct ccd_ioctl_60 *)data;
1087 		ccionew.ccio_disks = ccio->ccio_disks;
1088 		ccionew.ccio_ndisks = ccio->ccio_ndisks;
1089 		ccionew.ccio_ileave = ccio->ccio_ileave;
1090 		ccionew.ccio_flags = ccio->ccio_flags;
1091 		ccionew.ccio_unit = ccio->ccio_unit;
1092 		error = ccdioctl(dev, CCDIOCSET, &ccionew, flag, l);
1093 		if (!error) {
1094 			/* Copy data back, adjust types if necessary */
1095 			ccio60->ccio_disks = ccionew.ccio_disks;
1096 			ccio60->ccio_ndisks = ccionew.ccio_ndisks;
1097 			ccio60->ccio_ileave = ccionew.ccio_ileave;
1098 			ccio60->ccio_flags = ccionew.ccio_flags;
1099 			ccio60->ccio_unit = ccionew.ccio_unit;
1100 			ccio60->ccio_size = (size_t)ccionew.ccio_size;
1101 		}
1102 		return error;
1103 		}
1104 		break;
1105 
1106 	case CCDIOCCLR_60:
1107 		/*
1108 		 * ccio_size member not used, so existing struct OK
1109 		 * drop through to existing non-compat version
1110 		 */
1111 		cmd = CCDIOCCLR;
1112 		break;
1113 	}
1114 #endif /* COMPAT_60 && !_LP64*/
1115 
1116 	/* Must be open for writes for these commands... */
1117 	switch (cmd) {
1118 	case CCDIOCSET:
1119 	case CCDIOCCLR:
1120 	case DIOCSDINFO:
1121 	case DIOCWDINFO:
1122 #ifdef __HAVE_OLD_DISKLABEL
1123 	case ODIOCSDINFO:
1124 	case ODIOCWDINFO:
1125 #endif
1126 	case DIOCKLABEL:
1127 	case DIOCWLABEL:
1128 		if ((flag & FWRITE) == 0)
1129 			return (EBADF);
1130 	}
1131 
1132 	mutex_enter(&cs->sc_dvlock);
1133 
1134 	/* Must be initialized for these... */
1135 	switch (cmd) {
1136 	case CCDIOCCLR:
1137 	case DIOCGDINFO:
1138 	case DIOCCACHESYNC:
1139 	case DIOCSDINFO:
1140 	case DIOCWDINFO:
1141 	case DIOCGPART:
1142 	case DIOCWLABEL:
1143 	case DIOCKLABEL:
1144 	case DIOCGDEFLABEL:
1145 #ifdef __HAVE_OLD_DISKLABEL
1146 	case ODIOCGDINFO:
1147 	case ODIOCSDINFO:
1148 	case ODIOCWDINFO:
1149 	case ODIOCGDEFLABEL:
1150 #endif
1151 		if ((cs->sc_flags & CCDF_INITED) == 0) {
1152 			error = ENXIO;
1153 			goto out;
1154 		}
1155 	}
1156 
1157 	switch (cmd) {
1158 	case CCDIOCSET:
1159 		if (cs->sc_flags & CCDF_INITED) {
1160 			error = EBUSY;
1161 			goto out;
1162 		}
1163 
1164 		/* Validate the flags. */
1165 		if ((ccio->ccio_flags & CCDF_USERMASK) != ccio->ccio_flags) {
1166 			error = EINVAL;
1167 			goto out;
1168 		}
1169 
1170 		if (ccio->ccio_ndisks > CCD_MAXNDISKS ||
1171 		    ccio->ccio_ndisks == 0) {
1172 			error = EINVAL;
1173 			goto out;
1174 		}
1175 
1176 		/* Fill in some important bits. */
1177 		cs->sc_ileave = ccio->ccio_ileave;
1178 		cs->sc_nccdisks = ccio->ccio_ndisks;
1179 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1180 
1181 		/*
1182 		 * Allocate space for and copy in the array of
1183 		 * component pathnames and device numbers.
1184 		 */
1185 		cpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*cpp), KM_SLEEP);
1186 		vpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*vpp), KM_SLEEP);
1187 		error = copyin(ccio->ccio_disks, cpp,
1188 		    ccio->ccio_ndisks * sizeof(*cpp));
1189 		if (error) {
1190 			kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1191 			kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1192 			goto out;
1193 		}
1194 
1195 #ifdef DEBUG
1196 		if (ccddebug & CCDB_INIT)
1197 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1198 				printf("ccdioctl: component %d: %p\n",
1199 				    i, cpp[i]);
1200 #endif
1201 
1202 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1203 #ifdef DEBUG
1204 			if (ccddebug & CCDB_INIT)
1205 				printf("ccdioctl: lookedup = %d\n", lookedup);
1206 #endif
1207 			error = pathbuf_copyin(cpp[i], &pb);
1208 			if (error == 0) {
1209 				error = dk_lookup(pb, l, &vpp[i]);
1210 			}
1211 			pathbuf_destroy(pb);
1212 			if (error != 0) {
1213 				for (j = 0; j < lookedup; ++j)
1214 					(void)vn_close(vpp[j], FREAD|FWRITE,
1215 					    uc);
1216 				kmem_free(vpp, ccio->ccio_ndisks *
1217 				    sizeof(*vpp));
1218 				kmem_free(cpp, ccio->ccio_ndisks *
1219 				    sizeof(*cpp));
1220 				goto out;
1221 			}
1222 			++lookedup;
1223 		}
1224 
1225 		/* Attach the disk. */
1226 		disk_attach(&cs->sc_dkdev);
1227 		bufq_alloc(&cs->sc_bufq, "fcfs", 0);
1228 
1229 		/*
1230 		 * Initialize the ccd.  Fills in the softc for us.
1231 		 */
1232 		if ((error = ccdinit(cs, cpp, vpp, l)) != 0) {
1233 			for (j = 0; j < lookedup; ++j)
1234 				(void)vn_close(vpp[j], FREAD|FWRITE,
1235 				    uc);
1236 			kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1237 			kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1238 			disk_detach(&cs->sc_dkdev);
1239 			bufq_free(cs->sc_bufq);
1240 			goto out;
1241 		}
1242 
1243 		/* We can free the temporary variables now. */
1244 		kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp));
1245 		kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp));
1246 
1247 		/*
1248 		 * The ccd has been successfully initialized, so
1249 		 * we can place it into the array.  Don't try to
1250 		 * read the disklabel until the disk has been attached,
1251 		 * because space for the disklabel is allocated
1252 		 * in disk_attach();
1253 		 */
1254 		ccio->ccio_unit = unit;
1255 		ccio->ccio_size = cs->sc_size;
1256 
1257 		/* Try and read the disklabel. */
1258 		ccdgetdisklabel(dev);
1259 		break;
1260 
1261 	case CCDIOCCLR:
1262 		/*
1263 		 * Don't unconfigure if any other partitions are open
1264 		 * or if both the character and block flavors of this
1265 		 * partition are open.
1266 		 */
1267 		part = DISKPART(dev);
1268 		pmask = (1 << part);
1269 		if ((cs->sc_dkdev.dk_openmask & ~pmask) ||
1270 		    ((cs->sc_dkdev.dk_bopenmask & pmask) &&
1271 		    (cs->sc_dkdev.dk_copenmask & pmask))) {
1272 			error = EBUSY;
1273 			goto out;
1274 		}
1275 
1276 		/* Stop new I/O, wait for in-flight I/O to complete. */
1277 		mutex_enter(cs->sc_iolock);
1278 		cs->sc_flags &= ~(CCDF_INITED|CCDF_VLABEL);
1279 		cs->sc_zap = true;
1280 		while (disk_isbusy(&cs->sc_dkdev) ||
1281 		    bufq_peek(cs->sc_bufq) != NULL ||
1282 		    cs->sc_thread != NULL) {
1283 			cv_broadcast(&cs->sc_push);
1284 			(void)cv_timedwait(&cs->sc_stop, cs->sc_iolock, hz);
1285 		}
1286 		mutex_exit(cs->sc_iolock);
1287 
1288 		/*
1289 		 * Free ccd_softc information and clear entry.
1290 		 */
1291 
1292 		/* Close the components and free their pathnames. */
1293 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1294 			/*
1295 			 * XXX: this close could potentially fail and
1296 			 * cause Bad Things.  Maybe we need to force
1297 			 * the close to happen?
1298 			 */
1299 #ifdef DEBUG
1300 			if (ccddebug & CCDB_VNODE)
1301 				vprint("CCDIOCCLR: vnode info",
1302 				    cs->sc_cinfo[i].ci_vp);
1303 #endif
1304 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1305 			    uc);
1306 			kmem_free(cs->sc_cinfo[i].ci_path,
1307 			    cs->sc_cinfo[i].ci_pathlen);
1308 		}
1309 
1310 		/* Free interleave index. */
1311 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) {
1312 			kmem_free(cs->sc_itable[i].ii_index,
1313 			    cs->sc_itable[i].ii_indexsz);
1314 		}
1315 
1316 		/* Free component info and interleave table. */
1317 		kmem_free(cs->sc_cinfo, cs->sc_nccdisks *
1318 		    sizeof(struct ccdcinfo));
1319 		kmem_free(cs->sc_itable, (cs->sc_nccdisks + 1) *
1320 		    sizeof(struct ccdiinfo));
1321 
1322 		aprint_normal("%s: detached\n", cs->sc_xname);
1323 
1324 		/* Detach the disk. */
1325 		disk_detach(&cs->sc_dkdev);
1326 		bufq_free(cs->sc_bufq);
1327 		ccdput(cs);
1328 		/* Don't break, otherwise cs is read again. */
1329 		return 0;
1330 
1331 	case DIOCGDINFO:
1332 		*(struct disklabel *)data = *(cs->sc_dkdev.dk_label);
1333 		break;
1334 
1335 #ifdef __HAVE_OLD_DISKLABEL
1336 	case ODIOCGDINFO:
1337 		newlabel = *(cs->sc_dkdev.dk_label);
1338 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1339 			return ENOTTY;
1340 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1341 		break;
1342 #endif
1343 
1344 	case DIOCGPART:
1345 		((struct partinfo *)data)->disklab = cs->sc_dkdev.dk_label;
1346 		((struct partinfo *)data)->part =
1347 		    &cs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1348 		break;
1349 
1350 	case DIOCCACHESYNC:
1351 		/*
1352 		 * XXX Do we really need to care about having a writable
1353 		 * file descriptor here?
1354 		 */
1355 		if ((flag & FWRITE) == 0)
1356 			return (EBADF);
1357 
1358 		/*
1359 		 * We pass this call down to all components and report
1360 		 * the first error we encounter.
1361 		 */
1362 		for (error = 0, i = 0; i < cs->sc_nccdisks; i++) {
1363 			j = VOP_IOCTL(cs->sc_cinfo[i].ci_vp, cmd, data,
1364 				      flag, uc);
1365 			if (j != 0 && error == 0)
1366 				error = j;
1367 		}
1368 		break;
1369 
1370 	case DIOCWDINFO:
1371 	case DIOCSDINFO:
1372 #ifdef __HAVE_OLD_DISKLABEL
1373 	case ODIOCWDINFO:
1374 	case ODIOCSDINFO:
1375 #endif
1376 	{
1377 		struct disklabel *lp;
1378 #ifdef __HAVE_OLD_DISKLABEL
1379 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1380 			memset(&newlabel, 0, sizeof newlabel);
1381 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
1382 			lp = &newlabel;
1383 		} else
1384 #endif
1385 		lp = (struct disklabel *)data;
1386 
1387 		cs->sc_flags |= CCDF_LABELLING;
1388 
1389 		error = setdisklabel(cs->sc_dkdev.dk_label,
1390 		    lp, 0, cs->sc_dkdev.dk_cpulabel);
1391 		if (error == 0) {
1392 			if (cmd == DIOCWDINFO
1393 #ifdef __HAVE_OLD_DISKLABEL
1394 			    || cmd == ODIOCWDINFO
1395 #endif
1396 			   )
1397 				error = writedisklabel(CCDLABELDEV(dev),
1398 				    ccdstrategy, cs->sc_dkdev.dk_label,
1399 				    cs->sc_dkdev.dk_cpulabel);
1400 		}
1401 
1402 		cs->sc_flags &= ~CCDF_LABELLING;
1403 		break;
1404 	}
1405 
1406 	case DIOCKLABEL:
1407 		if (*(int *)data != 0)
1408 			cs->sc_flags |= CCDF_KLABEL;
1409 		else
1410 			cs->sc_flags &= ~CCDF_KLABEL;
1411 		break;
1412 
1413 	case DIOCWLABEL:
1414 		if (*(int *)data != 0)
1415 			cs->sc_flags |= CCDF_WLABEL;
1416 		else
1417 			cs->sc_flags &= ~CCDF_WLABEL;
1418 		break;
1419 
1420 	case DIOCGDEFLABEL:
1421 		ccdgetdefaultlabel(cs, (struct disklabel *)data);
1422 		break;
1423 
1424 #ifdef __HAVE_OLD_DISKLABEL
1425 	case ODIOCGDEFLABEL:
1426 		ccdgetdefaultlabel(cs, &newlabel);
1427 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1428 			return ENOTTY;
1429 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1430 		break;
1431 #endif
1432 
1433 	default:
1434 		error = ENOTTY;
1435 	}
1436 
1437  out:
1438 	mutex_exit(&cs->sc_dvlock);
1439 	return (error);
1440 }
1441 
1442 static int
1443 ccdsize(dev_t dev)
1444 {
1445 	struct ccd_softc *cs;
1446 	struct disklabel *lp;
1447 	int part, unit, omask, size;
1448 
1449 	unit = ccdunit(dev);
1450 	if ((cs = ccdget(unit)) == NULL)
1451 		return -1;
1452 
1453 	if ((cs->sc_flags & CCDF_INITED) == 0)
1454 		return (-1);
1455 
1456 	part = DISKPART(dev);
1457 	omask = cs->sc_dkdev.dk_openmask & (1 << part);
1458 	lp = cs->sc_dkdev.dk_label;
1459 
1460 	if (omask == 0 && ccdopen(dev, 0, S_IFBLK, curlwp))
1461 		return (-1);
1462 
1463 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
1464 		size = -1;
1465 	else
1466 		size = lp->d_partitions[part].p_size *
1467 		    (lp->d_secsize / DEV_BSIZE);
1468 
1469 	if (omask == 0 && ccdclose(dev, 0, S_IFBLK, curlwp))
1470 		return (-1);
1471 
1472 	return (size);
1473 }
1474 
1475 static void
1476 ccdgetdefaultlabel(struct ccd_softc *cs, struct disklabel *lp)
1477 {
1478 	struct ccdgeom *ccg = &cs->sc_geom;
1479 
1480 	memset(lp, 0, sizeof(*lp));
1481 
1482 	lp->d_secperunit = cs->sc_size;
1483 	lp->d_secsize = ccg->ccg_secsize;
1484 	lp->d_nsectors = ccg->ccg_nsectors;
1485 	lp->d_ntracks = ccg->ccg_ntracks;
1486 	lp->d_ncylinders = ccg->ccg_ncylinders;
1487 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1488 
1489 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1490 	lp->d_type = DTYPE_CCD;
1491 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1492 	lp->d_rpm = 3600;
1493 	lp->d_interleave = 1;
1494 	lp->d_flags = 0;
1495 
1496 	lp->d_partitions[RAW_PART].p_offset = 0;
1497 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1498 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1499 	lp->d_npartitions = RAW_PART + 1;
1500 
1501 	lp->d_magic = DISKMAGIC;
1502 	lp->d_magic2 = DISKMAGIC;
1503 	lp->d_checksum = dkcksum(cs->sc_dkdev.dk_label);
1504 }
1505 
1506 /*
1507  * Read the disklabel from the ccd.  If one is not present, fake one
1508  * up.
1509  */
1510 static void
1511 ccdgetdisklabel(dev_t dev)
1512 {
1513 	int unit = ccdunit(dev);
1514 	struct ccd_softc *cs;
1515 	const char *errstring;
1516 	struct disklabel *lp;
1517 	struct cpu_disklabel *clp;
1518 
1519 	if ((cs = ccdget(unit)) == NULL)
1520 		return;
1521 	lp = cs->sc_dkdev.dk_label;
1522 	clp = cs->sc_dkdev.dk_cpulabel;
1523 	KASSERT(mutex_owned(&cs->sc_dvlock));
1524 
1525 	memset(clp, 0, sizeof(*clp));
1526 
1527 	ccdgetdefaultlabel(cs, lp);
1528 
1529 	/*
1530 	 * Call the generic disklabel extraction routine.
1531 	 */
1532 	cs->sc_flags |= CCDF_RLABEL;
1533 	if ((cs->sc_flags & CCDF_NOLABEL) != 0)
1534 		errstring = "CCDF_NOLABEL set; ignoring on-disk label";
1535 	else
1536 		errstring = readdisklabel(CCDLABELDEV(dev), ccdstrategy,
1537 		    cs->sc_dkdev.dk_label, cs->sc_dkdev.dk_cpulabel);
1538 	if (errstring)
1539 		ccdmakedisklabel(cs);
1540 	else {
1541 		int i;
1542 		struct partition *pp;
1543 
1544 		/*
1545 		 * Sanity check whether the found disklabel is valid.
1546 		 *
1547 		 * This is necessary since total size of ccd may vary
1548 		 * when an interleave is changed even though exactly
1549 		 * same componets are used, and old disklabel may used
1550 		 * if that is found.
1551 		 */
1552 		if (lp->d_secperunit != cs->sc_size)
1553 			printf("WARNING: %s: "
1554 			    "total sector size in disklabel (%ju) != "
1555 			    "the size of ccd (%ju)\n", cs->sc_xname,
1556 			    (uintmax_t)lp->d_secperunit,
1557 			    (uintmax_t)cs->sc_size);
1558 		for (i = 0; i < lp->d_npartitions; i++) {
1559 			pp = &lp->d_partitions[i];
1560 			if (pp->p_offset + pp->p_size > cs->sc_size)
1561 				printf("WARNING: %s: end of partition `%c' "
1562 				    "exceeds the size of ccd (%ju)\n",
1563 				    cs->sc_xname, 'a' + i, (uintmax_t)cs->sc_size);
1564 		}
1565 	}
1566 
1567 #ifdef DEBUG
1568 	/* It's actually extremely common to have unlabeled ccds. */
1569 	if (ccddebug & CCDB_LABEL)
1570 		if (errstring != NULL)
1571 			printf("%s: %s\n", cs->sc_xname, errstring);
1572 #endif
1573 
1574 	/* In-core label now valid. */
1575 	cs->sc_flags = (cs->sc_flags | CCDF_VLABEL) & ~CCDF_RLABEL;
1576 }
1577 
1578 /*
1579  * Take care of things one might want to take care of in the event
1580  * that a disklabel isn't present.
1581  */
1582 static void
1583 ccdmakedisklabel(struct ccd_softc *cs)
1584 {
1585 	struct disklabel *lp = cs->sc_dkdev.dk_label;
1586 
1587 	/*
1588 	 * For historical reasons, if there's no disklabel present
1589 	 * the raw partition must be marked FS_BSDFFS.
1590 	 */
1591 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1592 
1593 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1594 
1595 	lp->d_checksum = dkcksum(lp);
1596 }
1597 
#ifdef DEBUG
/*
 * printiinfo:
 *
 *	Debug helper: print every entry of an interleave table, one per
 *	line, followed by its component indices.  The table ends at the
 *	first entry whose ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	struct ccdiinfo *ent;
	int slot, k;

	slot = 0;
	for (ent = ii; ent->ii_ndisk != 0; ent++) {
		printf(" itab[%d]: #dk %d sblk %" PRId64 " soff %" PRId64,
		    slot, ent->ii_ndisk, ent->ii_startblk, ent->ii_startoff);
		k = 0;
		while (k < ent->ii_ndisk) {
			printf(" %d", ent->ii_index[k]);
			k++;
		}
		printf("\n");
		slot++;
	}
}
#endif
1613 
1614 MODULE(MODULE_CLASS_DRIVER, ccd, "dk_subr");
1615 
1616 static int
1617 ccd_modcmd(modcmd_t cmd, void *arg)
1618 {
1619 	int error = 0;
1620 #ifdef _MODULE
1621 	int bmajor = -1, cmajor = -1;
1622 #endif
1623 
1624 
1625 	switch (cmd) {
1626 	case MODULE_CMD_INIT:
1627 #ifdef _MODULE
1628 		ccdattach(4);
1629 
1630 		return devsw_attach("ccd", &ccd_bdevsw, &bmajor,
1631 		    &ccd_cdevsw, &cmajor);
1632 #endif
1633 		break;
1634 
1635 	case MODULE_CMD_FINI:
1636 #ifdef _MODULE
1637 		return devsw_detach(&ccd_bdevsw, &ccd_cdevsw);
1638 #endif
1639 		break;
1640 
1641 	case MODULE_CMD_STAT:
1642 		return ENOTTY;
1643 
1644 	default:
1645 		return ENOTTY;
1646 	}
1647 
1648 	return error;
1649 }
1650 
1651 static int
1652 ccd_units_sysctl(SYSCTLFN_ARGS)
1653 {
1654 	struct sysctlnode node;
1655 	struct ccd_softc *sc;
1656 	int error, i, nccd, *units;
1657 	size_t size;
1658 
1659 	nccd = 0;
1660 	mutex_enter(&ccd_lock);
1661 	LIST_FOREACH(sc, &ccds, sc_link)
1662 		nccd++;
1663 	mutex_exit(&ccd_lock);
1664 
1665 	if (nccd != 0) {
1666 		size = nccd * sizeof(*units);
1667 		units = kmem_zalloc(size, KM_SLEEP);
1668 		if (units == NULL)
1669 			return ENOMEM;
1670 
1671 		i = 0;
1672 		mutex_enter(&ccd_lock);
1673 		LIST_FOREACH(sc, &ccds, sc_link) {
1674 			if (i >= nccd)
1675 				break;
1676 			units[i] = sc->sc_unit;
1677 		}
1678 		mutex_exit(&ccd_lock);
1679 	} else {
1680 		units = NULL;
1681 		size = 0;
1682 	}
1683 
1684 	node = *rnode;
1685 	node.sysctl_data = units;
1686 	node.sysctl_size = size;
1687 
1688 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1689 	if (units)
1690 		kmem_free(units, size);
1691 	return error;
1692 }
1693 
1694 static int
1695 ccd_info_sysctl(SYSCTLFN_ARGS)
1696 {
1697 	struct sysctlnode node;
1698 	struct ccddiskinfo ccd;
1699 	struct ccd_softc *sc;
1700 	int unit;
1701 
1702 	if (newp == NULL || newlen != sizeof(int))
1703 		return EINVAL;
1704 
1705 	unit = *(const int *)newp;
1706 	newp = NULL;
1707 	newlen = 0;
1708 	ccd.ccd_ndisks = ~0;
1709 	mutex_enter(&ccd_lock);
1710 	LIST_FOREACH(sc, &ccds, sc_link) {
1711 		if (sc->sc_unit == unit) {
1712 			ccd.ccd_ileave = sc->sc_ileave;
1713 			ccd.ccd_size = sc->sc_size;
1714 			ccd.ccd_ndisks = sc->sc_nccdisks;
1715 			ccd.ccd_flags = sc->sc_flags;
1716 			break;
1717 		}
1718 	}
1719 	mutex_exit(&ccd_lock);
1720 
1721 	if (ccd.ccd_ndisks == ~0)
1722 		return ENOENT;
1723 
1724 	node = *rnode;
1725 	node.sysctl_data = &ccd;
1726 	node.sysctl_size = sizeof(ccd);
1727 
1728 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1729 }
1730 
1731 static int
1732 ccd_components_sysctl(SYSCTLFN_ARGS)
1733 {
1734 	struct sysctlnode node;
1735 	int error, unit;
1736 	size_t size;
1737 	char *names, *p, *ep;
1738 	struct ccd_softc *sc;
1739 
1740 	if (newp == NULL || newlen != sizeof(int))
1741 		return EINVAL;
1742 
1743 	size = 0;
1744 	unit = *(const int *)newp;
1745 	newp = NULL;
1746 	newlen = 0;
1747 	mutex_enter(&ccd_lock);
1748 	LIST_FOREACH(sc, &ccds, sc_link)
1749 		if (sc->sc_unit == unit) {
1750 			for (size_t i = 0; i < sc->sc_nccdisks; i++)
1751 				size += strlen(sc->sc_cinfo[i].ci_path) + 1;
1752 			break;
1753 		}
1754 	mutex_exit(&ccd_lock);
1755 
1756 	if (size == 0)
1757 		return ENOENT;
1758 	names = kmem_zalloc(size, KM_SLEEP);
1759 	if (names == NULL)
1760 		return ENOMEM;
1761 
1762 	p = names;
1763 	ep = names + size;
1764 	mutex_enter(&ccd_lock);
1765 	LIST_FOREACH(sc, &ccds, sc_link)
1766 		if (sc->sc_unit == unit) {
1767 			for (size_t i = 0; i < sc->sc_nccdisks; i++) {
1768 				char *d = sc->sc_cinfo[i].ci_path;
1769 				while (p < ep && (*p++ = *d++) != '\0')
1770 					continue;
1771 			}
1772 			break;
1773 		}
1774 	mutex_exit(&ccd_lock);
1775 
1776 	node = *rnode;
1777 	node.sysctl_data = names;
1778 	node.sysctl_size = ep - names;
1779 
1780 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1781 	kmem_free(names, size);
1782 	return error;
1783 }
1784 
1785 SYSCTL_SETUP(sysctl_kern_ccd_setup, "sysctl kern.ccd subtree setup")
1786 {
1787 	const struct sysctlnode *node = NULL;
1788 
1789 	sysctl_createv(clog, 0, NULL, &node,
1790 	    CTLFLAG_PERMANENT,
1791 	    CTLTYPE_NODE, "ccd",
1792 	    SYSCTL_DESCR("ConCatenated Disk state"),
1793 	    NULL, 0, NULL, 0,
1794 	    CTL_KERN, CTL_CREATE, CTL_EOL);
1795 
1796 	if (node == NULL)
1797 		return;
1798 
1799 	sysctl_createv(clog, 0, &node, NULL,
1800 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
1801 	    CTLTYPE_STRUCT, "units",
1802 	    SYSCTL_DESCR("List of ccd unit numbers"),
1803 	    ccd_units_sysctl, 0, NULL, 0,
1804 	    CTL_CREATE, CTL_EOL);
1805 	sysctl_createv(clog, 0, &node, NULL,
1806 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1807 	    CTLTYPE_STRUCT, "info",
1808 	    SYSCTL_DESCR("Information about a CCD unit"),
1809 	    ccd_info_sysctl, 0, NULL, 0,
1810 	    CTL_CREATE, CTL_EOL);
1811 	sysctl_createv(clog, 0, &node, NULL,
1812 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1813 	    CTLTYPE_STRUCT, "components",
1814 	    SYSCTL_DESCR("Information about CCD components"),
1815 	    ccd_components_sysctl, 0, NULL, 0,
1816 	    CTL_CREATE, CTL_EOL);
1817 }
1818