xref: /netbsd-src/sys/dev/ata/ld_ataraid.c (revision 8b0f9554ff8762542c4defc4f70e1eb76fb508fa)
1 /*	$NetBSD: ld_ataraid.c,v 1.22 2007/11/26 19:01:36 pooka Exp $	*/
2 
3 /*
4  * Copyright (c) 2003 Wasabi Systems, Inc.
5  * All rights reserved.
6  *
7  * Written by Jason R. Thorpe for Wasabi Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project by
20  *	Wasabi Systems, Inc.
21  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22  *    or promote products derived from this software without specific prior
23  *    written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 /*
39  * Support for ATA RAID logical disks.
40  *
41  * Note that all the RAID happens in software here; the ATA RAID
42  * controllers we're dealing with (Promise, etc.) only support
43  * configuration data on the component disks, with the BIOS supporting
44  * booting from the RAID volumes.
45  */
46 
47 #include <sys/cdefs.h>
48 __KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.22 2007/11/26 19:01:36 pooka Exp $");
49 
50 #include "rnd.h"
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/conf.h>
55 #include <sys/kernel.h>
56 #include <sys/device.h>
57 #include <sys/buf.h>
58 #include <sys/bufq.h>
59 #include <sys/dkio.h>
60 #include <sys/disk.h>
61 #include <sys/disklabel.h>
62 #include <sys/fcntl.h>
63 #include <sys/malloc.h>
64 #include <sys/vnode.h>
65 #include <sys/kauth.h>
66 #if NRND > 0
67 #include <sys/rnd.h>
68 #endif
69 
70 #include <miscfs/specfs/specdev.h>
71 
72 #include <dev/ldvar.h>
73 
74 #include <dev/ata/ata_raidvar.h>
75 
/*
 * Per-array software state.
 *
 * sc_ld has to remain the first member: the attach and start routines
 * in this file convert between a pointer to this structure and a
 * pointer to the embedded struct ld_softc / struct device with a plain
 * cast.
 */
struct ld_ataraid_softc {
	struct ld_softc sc_ld;		/* generic ld state; must be first */

	struct ataraid_array_info *sc_aai;	/* array description (attach aux) */
	struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS]; /* opened components, by index */

	void	(*sc_iodone)(struct buf *);	/* b_iodone for component bufs */
};
84 
/* Autoconfiguration entry points. */
static int	ld_ataraid_match(struct device *, struct cfdata *, void *);
static void	ld_ataraid_attach(struct device *, struct device *, void *);

/* Dump hook installed in ld_softc::sc_dump. */
static int	ld_ataraid_dump(struct ld_softc *, void *, int, int);

/* Start routine used for SPAN arrays. */
static int	ld_ataraid_start_span(struct ld_softc *, struct buf *);

/* Start routine and completion handler shared by the RAID-0/1/10 levels. */
static int	ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
static void	ld_ataraid_iodone_raid0(struct buf *);

/*
 * Attachment glue: softc size plus the match/attach routines.  The two
 * remaining hooks are left NULL.
 */
CFATTACH_DECL(ld_ataraid, sizeof(struct ld_ataraid_softc),
    ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
97 
/* Non-zero once ld_ataraid_cbufpl has been set up (done by the first attach). */
static int ld_ataraid_initialized;
/* Pool from which struct cbuf component buffers are allocated. */
static struct pool ld_ataraid_cbufpl;
100 
/*
 * Component buffer: one per piece of a logical transfer that is sent to
 * a single component disk.
 *
 * cb_buf has to remain the first member; the completion handler casts
 * the struct buf pointer it is given back to a struct cbuf pointer.
 */
struct cbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct buf	*cb_obp;	/* ptr. to original I/O buf */
	struct ld_ataraid_softc *cb_sc;	/* pointer to ld softc */
	u_int		cb_comp;	/* target component */
	SIMPLEQ_ENTRY(cbuf) cb_q;	/* fifo of component buffers */
	struct cbuf	*cb_other;	/* other cbuf in case of mirror */
	int		cb_flags;	/* CBUF_* state bits */
#define	CBUF_IODONE	0x00000001	/* I/O is already successfully done */
};
111 
/*
 * Component buffer allocation helpers backed by ld_ataraid_cbufpl.
 *
 * CBUF_GET() allocates with PR_NOWAIT and its result must be checked
 * for NULL.  Both macros expand to a plain expression (no trailing
 * semicolon) so they can be used wherever an expression is allowed.
 */
#define	CBUF_GET()	pool_get(&ld_ataraid_cbufpl, PR_NOWAIT)
#define	CBUF_PUT(cbp)	pool_put(&ld_ataraid_cbufpl, (cbp))
114 
/*
 * Autoconfiguration match routine.
 *
 * None of the arguments are examined; every candidate is accepted.
 * Returns 1 (match) unconditionally.
 */
static int
ld_ataraid_match(struct device *parent,
    struct cfdata *match, void *aux)
{
	const int matched = 1;

	return matched;
}
122 
123 static void
124 ld_ataraid_attach(struct device *parent, struct device *self,
125     void *aux)
126 {
127 	struct ld_ataraid_softc *sc = (void *) self;
128 	struct ld_softc *ld = &sc->sc_ld;
129 	struct ataraid_array_info *aai = aux;
130 	const char *level;
131 	struct vnode *vp;
132 	char unklev[32];
133 	u_int i;
134 
135 	if (ld_ataraid_initialized == 0) {
136 		ld_ataraid_initialized = 1;
137 		pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
138 		    0, 0, "ldcbuf", NULL, IPL_BIO);
139 	}
140 
141 	sc->sc_aai = aai;	/* this data persists */
142 
143 	ld->sc_maxxfer = MAXPHYS * aai->aai_width;	/* XXX */
144 	ld->sc_secperunit = aai->aai_capacity;
145 	ld->sc_secsize = 512;				/* XXX */
146 	ld->sc_maxqueuecnt = 128;			/* XXX */
147 	ld->sc_dump = ld_ataraid_dump;
148 
149 	switch (aai->aai_level) {
150 	case AAI_L_SPAN:
151 		level = "SPAN";
152 		ld->sc_start = ld_ataraid_start_span;
153 		sc->sc_iodone = ld_ataraid_iodone_raid0;
154 		break;
155 
156 	case AAI_L_RAID0:
157 		level = "RAID-0";
158 		ld->sc_start = ld_ataraid_start_raid0;
159 		sc->sc_iodone = ld_ataraid_iodone_raid0;
160 		break;
161 
162 	case AAI_L_RAID1:
163 		level = "RAID-1";
164 		ld->sc_start = ld_ataraid_start_raid0;
165 		sc->sc_iodone = ld_ataraid_iodone_raid0;
166 		break;
167 
168 	case AAI_L_RAID0 | AAI_L_RAID1:
169 		level = "RAID-10";
170 		ld->sc_start = ld_ataraid_start_raid0;
171 		sc->sc_iodone = ld_ataraid_iodone_raid0;
172 		break;
173 
174 	default:
175 		snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
176 		    aai->aai_level);
177 		level = unklev;
178 	}
179 
180 	aprint_naive(": ATA %s array\n", level);
181 	aprint_normal(": %s ATA %s array\n",
182 	    ata_raid_type_name(aai->aai_type), level);
183 
184 	if (ld->sc_start == NULL) {
185 		aprint_error("%s: unsupported array type\n",
186 		    ld->sc_dv.dv_xname);
187 		return;
188 	}
189 
190 	/*
191 	 * We get a geometry from the device; use it.
192 	 */
193 	ld->sc_nheads = aai->aai_heads;
194 	ld->sc_nsectors = aai->aai_sectors;
195 	ld->sc_ncylinders = aai->aai_cylinders;
196 
197 	/*
198 	 * Configure all the component disks.
199 	 */
200 	for (i = 0; i < aai->aai_ndisks; i++) {
201 		struct ataraid_disk_info *adi = &aai->aai_disks[i];
202 		int bmajor, error;
203 		dev_t dev;
204 
205 		bmajor = devsw_name2blk(adi->adi_dev->dv_xname, NULL, 0);
206 		dev = MAKEDISKDEV(bmajor, device_unit(adi->adi_dev), RAW_PART);
207 		error = bdevvp(dev, &vp);
208 		if (error)
209 			break;
210 		error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED);
211 		if (error) {
212 			vput(vp);
213 			/*
214 			 * XXX This is bogus.  We should just mark the
215 			 * XXX component as FAILED, and write-back new
216 			 * XXX config blocks.
217 			 */
218 			break;
219 		}
220 
221 		VOP_UNLOCK(vp, 0);
222 		sc->sc_vnodes[i] = vp;
223 	}
224 	if (i == aai->aai_ndisks) {
225 		ld->sc_flags = LDF_ENABLED;
226 		goto finish;
227 	}
228 
229 	for (i = 0; i < aai->aai_ndisks; i++) {
230 		vp = sc->sc_vnodes[i];
231 		sc->sc_vnodes[i] = NULL;
232 		if (vp != NULL)
233 			(void) vn_close(vp, FREAD|FWRITE, NOCRED, curlwp);
234 	}
235 
236  finish:
237 	ldattach(ld);
238 }
239 
240 static struct cbuf *
241 ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
242     u_int comp, daddr_t bn, void *addr, long bcount)
243 {
244 	struct cbuf *cbp;
245 
246 	cbp = CBUF_GET();
247 	if (cbp == NULL)
248 		return (NULL);
249 	BUF_INIT(&cbp->cb_buf);
250 	cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
251 	cbp->cb_buf.b_iodone = sc->sc_iodone;
252 	cbp->cb_buf.b_proc = bp->b_proc;
253 	cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
254 	cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
255 	cbp->cb_buf.b_data = addr;
256 	cbp->cb_buf.b_bcount = bcount;
257 
258 	/* Context for iodone */
259 	cbp->cb_obp = bp;
260 	cbp->cb_sc = sc;
261 	cbp->cb_comp = comp;
262 	cbp->cb_other = NULL;
263 	cbp->cb_flags = 0;
264 
265 	return (cbp);
266 }
267 
268 static int
269 ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
270 {
271 	struct ld_ataraid_softc *sc = (void *) ld;
272 	struct ataraid_array_info *aai = sc->sc_aai;
273 	struct ataraid_disk_info *adi;
274 	SIMPLEQ_HEAD(, cbuf) cbufq;
275 	struct cbuf *cbp;
276 	char *addr;
277 	daddr_t bn;
278 	long bcount, rcount;
279 	u_int comp;
280 
281 	/* Allocate component buffers. */
282 	SIMPLEQ_INIT(&cbufq);
283 	addr = bp->b_data;
284 
285 	/* Find the first component. */
286 	comp = 0;
287 	adi = &aai->aai_disks[comp];
288 	bn = bp->b_rawblkno;
289 	while (bn >= adi->adi_compsize) {
290 		bn -= adi->adi_compsize;
291 		adi = &aai->aai_disks[++comp];
292 	}
293 
294 	bp->b_resid = bp->b_bcount;
295 
296 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
297 		rcount = bp->b_bcount;
298 		if ((adi->adi_compsize - bn) < btodb(rcount))
299 			rcount = dbtob(adi->adi_compsize - bn);
300 
301 		cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
302 		if (cbp == NULL) {
303 			/* Free the already allocated component buffers. */
304 			while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
305 				SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
306 				CBUF_PUT(cbp);
307 			}
308 			return (EAGAIN);
309 		}
310 
311 		/*
312 		 * For a span, we always know we advance to the next disk,
313 		 * and always start at offset 0 on that disk.
314 		 */
315 		adi = &aai->aai_disks[++comp];
316 		bn = 0;
317 
318 		SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
319 		addr += rcount;
320 	}
321 
322 	/* Now fire off the requests. */
323 	while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
324 		SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
325 		if ((cbp->cb_buf.b_flags & B_READ) == 0)
326 			cbp->cb_buf.b_vp->v_numoutput++;
327 		VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
328 	}
329 
330 	return (0);
331 }
332 
333 static int
334 ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
335 {
336 	struct ld_ataraid_softc *sc = (void *) ld;
337 	struct ataraid_array_info *aai = sc->sc_aai;
338 	struct ataraid_disk_info *adi;
339 	SIMPLEQ_HEAD(, cbuf) cbufq;
340 	struct cbuf *cbp, *other_cbp;
341 	char *addr;
342 	daddr_t bn, cbn, tbn, off;
343 	long bcount, rcount;
344 	u_int comp;
345 	const int read = bp->b_flags & B_READ;
346 	const int mirror = aai->aai_level & AAI_L_RAID1;
347 	int error;
348 
349 	/* Allocate component buffers. */
350 	SIMPLEQ_INIT(&cbufq);
351 	addr = bp->b_data;
352 	bn = bp->b_rawblkno;
353 
354 	bp->b_resid = bp->b_bcount;
355 
356 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
357 		tbn = bn / aai->aai_interleave;
358 		off = bn % aai->aai_interleave;
359 
360 		if (__predict_false(tbn == aai->aai_capacity /
361 					   aai->aai_interleave)) {
362 			/* Last stripe. */
363 			daddr_t sz = (aai->aai_capacity -
364 				      (tbn * aai->aai_interleave)) /
365 				     aai->aai_width;
366 			comp = off / sz;
367 			cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
368 			    (off % sz);
369 			rcount = min(bcount, dbtob(sz));
370 		} else {
371 			comp = tbn % aai->aai_width;
372 			cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
373 			    off;
374 			rcount = min(bcount, dbtob(aai->aai_interleave - off));
375 		}
376 
377 		/*
378 		 * See if a component is valid.
379 		 */
380 try_mirror:
381 		adi = &aai->aai_disks[comp];
382 		if ((adi->adi_status & ADI_S_ONLINE) == 0) {
383 			if (mirror && comp < aai->aai_width) {
384 				comp += aai->aai_width;
385 				goto try_mirror;
386 			}
387 
388 			/*
389 			 * No component available.
390 			 */
391 			error = EIO;
392 			goto free_and_exit;
393 		}
394 
395 		cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
396 		if (cbp == NULL) {
397 resource_shortage:
398 			error = EAGAIN;
399 free_and_exit:
400 			/* Free the already allocated component buffers. */
401 			while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
402 				SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
403 				CBUF_PUT(cbp);
404 			}
405 			return (error);
406 		}
407 		SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
408 		if (mirror && !read && comp < aai->aai_width) {
409 			comp += aai->aai_width;
410 			adi = &aai->aai_disks[comp];
411 			if (adi->adi_status & ADI_S_ONLINE) {
412 				other_cbp = ld_ataraid_make_cbuf(sc, bp,
413 				    comp, cbn, addr, rcount);
414 				if (other_cbp == NULL)
415 					goto resource_shortage;
416 				SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
417 				other_cbp->cb_other = cbp;
418 				cbp->cb_other = other_cbp;
419 			}
420 		}
421 		bn += btodb(rcount);
422 		addr += rcount;
423 	}
424 
425 	/* Now fire off the requests. */
426 	while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
427 		SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
428 		if ((cbp->cb_buf.b_flags & B_READ) == 0)
429 			cbp->cb_buf.b_vp->v_numoutput++;
430 		VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
431 	}
432 
433 	return (0);
434 }
435 
436 /*
437  * Called at interrupt time.  Mark the component as done and if all
438  * components are done, take an "interrupt".
439  */
440 static void
441 ld_ataraid_iodone_raid0(struct buf *vbp)
442 {
443 	struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
444 	struct buf *bp = cbp->cb_obp;
445 	struct ld_ataraid_softc *sc = cbp->cb_sc;
446 	struct ataraid_array_info *aai = sc->sc_aai;
447 	struct ataraid_disk_info *adi;
448 	long count;
449 	int s, iodone;
450 
451 	s = splbio();
452 
453 	iodone = cbp->cb_flags & CBUF_IODONE;
454 	other_cbp = cbp->cb_other;
455 	if (other_cbp != NULL)
456 		/* You are alone */
457 		other_cbp->cb_other = NULL;
458 
459 	if (cbp->cb_buf.b_error != 0) {
460 		/*
461 		 * Mark this component broken.
462 		 */
463 		adi = &aai->aai_disks[cbp->cb_comp];
464 		adi->adi_status &= ~ADI_S_ONLINE;
465 
466 		printf("%s: error %d on component %d (%s)\n",
467 		    sc->sc_ld.sc_dv.dv_xname, bp->b_error, cbp->cb_comp,
468 		    adi->adi_dev->dv_xname);
469 
470 		/*
471 		 * If we didn't see an error yet and we are reading
472 		 * RAID1 disk, try another component.
473 		 */
474 		if (bp->b_error == 0 &&
475 		    (cbp->cb_buf.b_flags & B_READ) != 0 &&
476 		    (aai->aai_level & AAI_L_RAID1) != 0 &&
477 		    cbp->cb_comp < aai->aai_width) {
478 			cbp->cb_comp += aai->aai_width;
479 			adi = &aai->aai_disks[cbp->cb_comp];
480 			if (adi->adi_status & ADI_S_ONLINE) {
481 				cbp->cb_buf.b_error = 0;
482 				VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
483 				goto out;
484 			}
485 		}
486 
487 		if (iodone || other_cbp != NULL)
488 			/*
489 			 * If I/O on other component successfully done
490 			 * or the I/O is still in progress, no need
491 			 * to tell an error to upper layer.
492 			 */
493 			;
494 		else {
495 			bp->b_error = cbp->cb_buf.b_error ?
496 			    cbp->cb_buf.b_error : EIO;
497 		}
498 
499 		/* XXX Update component config blocks. */
500 
501 	} else {
502 		/*
503 		 * If other I/O is still in progress, tell it that
504 		 * our I/O is successfully done.
505 		 */
506 		if (other_cbp != NULL)
507 			other_cbp->cb_flags |= CBUF_IODONE;
508 	}
509 	count = cbp->cb_buf.b_bcount;
510 	CBUF_PUT(cbp);
511 
512 	if (other_cbp != NULL)
513 		goto out;
514 
515 	/* If all done, "interrupt". */
516 	bp->b_resid -= count;
517 	if (bp->b_resid < 0)
518 		panic("ld_ataraid_iodone_raid0: count");
519 	if (bp->b_resid == 0)
520 		lddone(&sc->sc_ld, bp);
521 
522 out:
523 	splx(s);
524 }
525 
/*
 * Dump hook (installed as sc_dump by the attach routine).
 *
 * No dump support is implemented here: the arguments are ignored and
 * every request is refused with EIO.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data,
    int blkno, int blkcnt)
{
	const int error = EIO;

	return error;
}
533