xref: /netbsd-src/sys/dev/ata/ld_ataraid.c (revision e5548b402ae4c44fb816de42c7bba9581ce23ef5)
1 /*	$NetBSD: ld_ataraid.c,v 1.14 2005/12/11 12:21:14 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 2003 Wasabi Systems, Inc.
5  * All rights reserved.
6  *
7  * Written by Jason R. Thorpe for Wasabi Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project by
20  *	Wasabi Systems, Inc.
21  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22  *    or promote products derived from this software without specific prior
23  *    written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 /*
39  * Support for ATA RAID logical disks.
40  *
41  * Note that all the RAID happens in software here; the ATA RAID
42  * controllers we're dealing with (Promise, etc.) only support
43  * configuration data on the component disks, with the BIOS supporting
44  * booting from the RAID volumes.
45  */
46 
47 #include <sys/cdefs.h>
48 __KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.14 2005/12/11 12:21:14 christos Exp $");
49 
50 #include "rnd.h"
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/conf.h>
55 #include <sys/kernel.h>
56 #include <sys/device.h>
57 #include <sys/buf.h>
58 #include <sys/bufq.h>
59 #include <sys/dkio.h>
60 #include <sys/disk.h>
61 #include <sys/disklabel.h>
62 #include <sys/fcntl.h>
63 #include <sys/malloc.h>
64 #include <sys/vnode.h>
65 #if NRND > 0
66 #include <sys/rnd.h>
67 #endif
68 
69 #include <miscfs/specfs/specdev.h>
70 
71 #include <dev/ldvar.h>
72 
73 #include <dev/ata/ata_raidvar.h>
74 
75 struct ld_ataraid_softc {
76 	struct ld_softc sc_ld;
77 
78 	struct ataraid_array_info *sc_aai;
79 	struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS];
80 
81 	void	(*sc_iodone)(struct buf *);
82 };
83 
84 static int	ld_ataraid_match(struct device *, struct cfdata *, void *);
85 static void	ld_ataraid_attach(struct device *, struct device *, void *);
86 
87 static int	ld_ataraid_dump(struct ld_softc *, void *, int, int);
88 
89 static int	ld_ataraid_start_span(struct ld_softc *, struct buf *);
90 
91 static int	ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
92 static void	ld_ataraid_iodone_raid0(struct buf *);
93 
94 CFATTACH_DECL(ld_ataraid, sizeof(struct ld_ataraid_softc),
95     ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
96 
97 static int ld_ataraid_initialized;
98 static struct pool ld_ataraid_cbufpl;
99 
100 struct cbuf {
101 	struct buf	cb_buf;		/* new I/O buf */
102 	struct buf	*cb_obp;	/* ptr. to original I/O buf */
103 	struct ld_ataraid_softc *cb_sc;	/* pointer to ld softc */
104 	u_int		cb_comp;	/* target component */
105 	SIMPLEQ_ENTRY(cbuf) cb_q;	/* fifo of component buffers */
106 	struct cbuf	*cb_other;	/* other cbuf in case of mirror */
107 	int		cb_flags;
108 #define	CBUF_IODONE	0x00000001	/* I/O is already successfully done */
109 };
110 
111 #define	CBUF_GET()	pool_get(&ld_ataraid_cbufpl, PR_NOWAIT);
112 #define	CBUF_PUT(cbp)	pool_put(&ld_ataraid_cbufpl, (cbp))
113 
/*
 * Autoconfiguration match routine.  Nothing is probed here: every
 * attachment offered to this driver is accepted unconditionally.
 */
static int
ld_ataraid_match(struct device *parent, struct cfdata *match, void *aux)
{
	const int matched = 1;

	return matched;
}
120 
121 static void
122 ld_ataraid_attach(struct device *parent, struct device *self, void *aux)
123 {
124 	struct ld_ataraid_softc *sc = (void *) self;
125 	struct ld_softc *ld = &sc->sc_ld;
126 	struct ataraid_array_info *aai = aux;
127 	const char *level;
128 	struct vnode *vp;
129 	char unklev[32];
130 	u_int i;
131 
132 	if (ld_ataraid_initialized == 0) {
133 		ld_ataraid_initialized = 1;
134 		pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
135 		    0, 0, "ldcbuf", NULL);
136 	}
137 
138 	sc->sc_aai = aai;	/* this data persists */
139 
140 	ld->sc_maxxfer = MAXPHYS * aai->aai_width;	/* XXX */
141 	ld->sc_secperunit = aai->aai_capacity;
142 	ld->sc_secsize = 512;				/* XXX */
143 	ld->sc_maxqueuecnt = 128;			/* XXX */
144 	ld->sc_dump = ld_ataraid_dump;
145 
146 	switch (aai->aai_level) {
147 	case AAI_L_SPAN:
148 		level = "SPAN";
149 		ld->sc_start = ld_ataraid_start_span;
150 		sc->sc_iodone = ld_ataraid_iodone_raid0;
151 		break;
152 
153 	case AAI_L_RAID0:
154 		level = "RAID-0";
155 		ld->sc_start = ld_ataraid_start_raid0;
156 		sc->sc_iodone = ld_ataraid_iodone_raid0;
157 		break;
158 
159 	case AAI_L_RAID1:
160 		level = "RAID-1";
161 		ld->sc_start = ld_ataraid_start_raid0;
162 		sc->sc_iodone = ld_ataraid_iodone_raid0;
163 		break;
164 
165 	case AAI_L_RAID0 | AAI_L_RAID1:
166 		level = "RAID-10";
167 		ld->sc_start = ld_ataraid_start_raid0;
168 		sc->sc_iodone = ld_ataraid_iodone_raid0;
169 		break;
170 
171 	default:
172 		snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
173 		    aai->aai_level);
174 		level = unklev;
175 	}
176 
177 	aprint_naive(": ATA %s array\n", level);
178 	aprint_normal(": %s ATA %s array\n",
179 	    ata_raid_type_name(aai->aai_type), level);
180 
181 	if (ld->sc_start == NULL) {
182 		aprint_error("%s: unsupported array type\n",
183 		    ld->sc_dv.dv_xname);
184 		return;
185 	}
186 
187 	/*
188 	 * We get a geometry from the device; use it.
189 	 */
190 	ld->sc_nheads = aai->aai_heads;
191 	ld->sc_nsectors = aai->aai_sectors;
192 	ld->sc_ncylinders = aai->aai_cylinders;
193 
194 	/*
195 	 * Configure all the component disks.
196 	 */
197 	for (i = 0; i < aai->aai_ndisks; i++) {
198 		struct ataraid_disk_info *adi = &aai->aai_disks[i];
199 		int bmajor, error;
200 		dev_t dev;
201 
202 		bmajor = devsw_name2blk(adi->adi_dev->dv_xname, NULL, 0);
203 		dev = MAKEDISKDEV(bmajor, adi->adi_dev->dv_unit, RAW_PART);
204 		error = bdevvp(dev, &vp);
205 		if (error)
206 			break;
207 		error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED, 0);
208 		if (error) {
209 			vput(vp);
210 			/*
211 			 * XXX This is bogus.  We should just mark the
212 			 * XXX component as FAILED, and write-back new
213 			 * XXX config blocks.
214 			 */
215 			break;
216 		}
217 
218 		VOP_UNLOCK(vp, 0);
219 		sc->sc_vnodes[i] = vp;
220 	}
221 	if (i == aai->aai_ndisks) {
222 		ld->sc_flags = LDF_ENABLED;
223 		goto finish;
224 	}
225 
226 	for (i = 0; i < aai->aai_ndisks; i++) {
227 		vp = sc->sc_vnodes[i];
228 		sc->sc_vnodes[i] = NULL;
229 		if (vp != NULL)
230 			(void) vn_close(vp, FREAD|FWRITE, NOCRED, curlwp);
231 	}
232 
233  finish:
234 	ldattach(ld);
235 }
236 
237 static struct cbuf *
238 ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
239     u_int comp, daddr_t bn, caddr_t addr, long bcount)
240 {
241 	struct cbuf *cbp;
242 
243 	cbp = CBUF_GET();
244 	if (cbp == NULL)
245 		return (NULL);
246 	BUF_INIT(&cbp->cb_buf);
247 	cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
248 	cbp->cb_buf.b_iodone = sc->sc_iodone;
249 	cbp->cb_buf.b_proc = bp->b_proc;
250 	cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
251 	cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
252 	cbp->cb_buf.b_data = addr;
253 	cbp->cb_buf.b_bcount = bcount;
254 
255 	/* Context for iodone */
256 	cbp->cb_obp = bp;
257 	cbp->cb_sc = sc;
258 	cbp->cb_comp = comp;
259 	cbp->cb_other = NULL;
260 	cbp->cb_flags = 0;
261 
262 	return (cbp);
263 }
264 
265 static int
266 ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
267 {
268 	struct ld_ataraid_softc *sc = (void *) ld;
269 	struct ataraid_array_info *aai = sc->sc_aai;
270 	struct ataraid_disk_info *adi;
271 	SIMPLEQ_HEAD(, cbuf) cbufq;
272 	struct cbuf *cbp;
273 	caddr_t addr;
274 	daddr_t bn;
275 	long bcount, rcount;
276 	u_int comp;
277 
278 	/* Allocate component buffers. */
279 	SIMPLEQ_INIT(&cbufq);
280 	addr = bp->b_data;
281 
282 	/* Find the first component. */
283 	comp = 0;
284 	adi = &aai->aai_disks[comp];
285 	bn = bp->b_rawblkno;
286 	while (bn >= adi->adi_compsize) {
287 		bn -= adi->adi_compsize;
288 		adi = &aai->aai_disks[++comp];
289 	}
290 
291 	bp->b_resid = bp->b_bcount;
292 
293 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
294 		rcount = bp->b_bcount;
295 		if ((adi->adi_compsize - bn) < btodb(rcount))
296 			rcount = dbtob(adi->adi_compsize - bn);
297 
298 		cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
299 		if (cbp == NULL) {
300 			/* Free the already allocated component buffers. */
301 			while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
302 				SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
303 				CBUF_PUT(cbp);
304 			}
305 			return (EAGAIN);
306 		}
307 
308 		/*
309 		 * For a span, we always know we advance to the next disk,
310 		 * and always start at offset 0 on that disk.
311 		 */
312 		adi = &aai->aai_disks[++comp];
313 		bn = 0;
314 
315 		SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
316 		addr += rcount;
317 	}
318 
319 	/* Now fire off the requests. */
320 	while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
321 		SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
322 		if ((cbp->cb_buf.b_flags & B_READ) == 0)
323 			cbp->cb_buf.b_vp->v_numoutput++;
324 		VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
325 	}
326 
327 	return (0);
328 }
329 
330 static int
331 ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
332 {
333 	struct ld_ataraid_softc *sc = (void *) ld;
334 	struct ataraid_array_info *aai = sc->sc_aai;
335 	struct ataraid_disk_info *adi;
336 	SIMPLEQ_HEAD(, cbuf) cbufq;
337 	struct cbuf *cbp, *other_cbp;
338 	caddr_t addr;
339 	daddr_t bn, cbn, tbn, off;
340 	long bcount, rcount;
341 	u_int comp;
342 	const int read = bp->b_flags & B_READ;
343 	const int mirror = aai->aai_level & AAI_L_RAID1;
344 	int error;
345 
346 	/* Allocate component buffers. */
347 	SIMPLEQ_INIT(&cbufq);
348 	addr = bp->b_data;
349 	bn = bp->b_rawblkno;
350 
351 	bp->b_resid = bp->b_bcount;
352 
353 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
354 		tbn = bn / aai->aai_interleave;
355 		off = bn % aai->aai_interleave;
356 
357 		if (__predict_false(tbn == aai->aai_capacity /
358 					   aai->aai_interleave)) {
359 			/* Last stripe. */
360 			daddr_t sz = (aai->aai_capacity -
361 				      (tbn * aai->aai_interleave)) /
362 				     aai->aai_width;
363 			comp = off / sz;
364 			cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
365 			    (off % sz);
366 			rcount = min(bcount, dbtob(sz));
367 		} else {
368 			comp = tbn % aai->aai_width;
369 			cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
370 			    off;
371 			rcount = min(bcount, dbtob(aai->aai_interleave - off));
372 		}
373 
374 		/*
375 		 * See if a component is valid.
376 		 */
377 try_mirror:
378 		adi = &aai->aai_disks[comp];
379 		if ((adi->adi_status & ADI_S_ONLINE) == 0) {
380 			if (mirror && comp < aai->aai_width) {
381 				comp += aai->aai_width;
382 				goto try_mirror;
383 			}
384 
385 			/*
386 			 * No component available.
387 			 */
388 			error = EIO;
389 			goto free_and_exit;
390 		}
391 
392 		cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
393 		if (cbp == NULL) {
394 resource_shortage:
395 			error = EAGAIN;
396 free_and_exit:
397 			/* Free the already allocated component buffers. */
398 			while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
399 				SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
400 				CBUF_PUT(cbp);
401 			}
402 			return (error);
403 		}
404 		SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
405 		if (mirror && !read && comp < aai->aai_width) {
406 			comp += aai->aai_width;
407 			adi = &aai->aai_disks[comp];
408 			if (adi->adi_status & ADI_S_ONLINE) {
409 				other_cbp = ld_ataraid_make_cbuf(sc, bp,
410 				    comp, cbn, addr, rcount);
411 				if (other_cbp == NULL)
412 					goto resource_shortage;
413 				SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
414 				other_cbp->cb_other = cbp;
415 				cbp->cb_other = other_cbp;
416 			}
417 		}
418 		bn += btodb(rcount);
419 		addr += rcount;
420 	}
421 
422 	/* Now fire off the requests. */
423 	while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
424 		SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
425 		if ((cbp->cb_buf.b_flags & B_READ) == 0)
426 			cbp->cb_buf.b_vp->v_numoutput++;
427 		VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
428 	}
429 
430 	return (0);
431 }
432 
433 /*
434  * Called at interrupt time.  Mark the component as done and if all
435  * components are done, take an "interrupt".
436  */
437 static void
438 ld_ataraid_iodone_raid0(struct buf *vbp)
439 {
440 	struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
441 	struct buf *bp = cbp->cb_obp;
442 	struct ld_ataraid_softc *sc = cbp->cb_sc;
443 	struct ataraid_array_info *aai = sc->sc_aai;
444 	struct ataraid_disk_info *adi;
445 	long count;
446 	int s, iodone;
447 
448 	s = splbio();
449 
450 	iodone = cbp->cb_flags & CBUF_IODONE;
451 	other_cbp = cbp->cb_other;
452 	if (other_cbp != NULL)
453 		/* You are alone */
454 		other_cbp->cb_other = NULL;
455 
456 	if (cbp->cb_buf.b_flags & B_ERROR) {
457 		/*
458 		 * Mark this component broken.
459 		 */
460 		adi = &aai->aai_disks[cbp->cb_comp];
461 		adi->adi_status &= ~ADI_S_ONLINE;
462 
463 		printf("%s: error %d on component %d (%s)\n",
464 		    sc->sc_ld.sc_dv.dv_xname, bp->b_error, cbp->cb_comp,
465 		    adi->adi_dev->dv_xname);
466 
467 		/*
468 		 * If we didn't see an error yet and we are reading
469 		 * RAID1 disk, try another component.
470 		 */
471 		if ((bp->b_flags & B_ERROR) == 0 &&
472 		    (cbp->cb_buf.b_flags & B_READ) != 0 &&
473 		    (aai->aai_level & AAI_L_RAID1) != 0 &&
474 		    cbp->cb_comp < aai->aai_width) {
475 			cbp->cb_comp += aai->aai_width;
476 			adi = &aai->aai_disks[cbp->cb_comp];
477 			if (adi->adi_status & ADI_S_ONLINE) {
478 				cbp->cb_buf.b_flags &= ~B_ERROR;
479 				VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
480 				goto out;
481 			}
482 		}
483 
484 		if (iodone || other_cbp != NULL)
485 			/*
486 			 * If I/O on other component successfully done
487 			 * or the I/O is still in progress, no need
488 			 * to tell an error to upper layer.
489 			 */
490 			;
491 		else {
492 			bp->b_flags |= B_ERROR;
493 			bp->b_error = cbp->cb_buf.b_error ?
494 			    cbp->cb_buf.b_error : EIO;
495 		}
496 
497 		/* XXX Update component config blocks. */
498 
499 	} else {
500 		/*
501 		 * If other I/O is still in progress, tell it that
502 		 * our I/O is successfully done.
503 		 */
504 		if (other_cbp != NULL)
505 			other_cbp->cb_flags |= CBUF_IODONE;
506 	}
507 	count = cbp->cb_buf.b_bcount;
508 	CBUF_PUT(cbp);
509 
510 	if (other_cbp != NULL)
511 		goto out;
512 
513 	/* If all done, "interrupt". */
514 	bp->b_resid -= count;
515 	if (bp->b_resid < 0)
516 		panic("ld_ataraid_iodone_raid0: count");
517 	if (bp->b_resid == 0)
518 		lddone(&sc->sc_ld, bp);
519 
520 out:
521 	splx(s);
522 }
523 
/*
 * Dump entry point installed in the ld softc.  Dumping to an ATA RAID
 * array is not implemented: the arguments are ignored and the call
 * always fails with EIO.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data, int blkno, int blkcnt)
{
	const int error = EIO;

	return error;
}
530