xref: /plan9/sys/src/9/port/devfs.c (revision 3468a4915d661daa200976acc4f80f51aae144b2)
1 /*
2  * File system devices.
3  * Follows device config in Ken's file server.
4  * Builds mirrors, concatenations, interleavings, and partitions
5  * of devices out of other (inner) devices.
6  * It is ok if inner devices are provided by this driver.
7  *
8  * Built files are grouped on different directories
9  * (called trees, and used to represent disks).
10  * The "#k/fs" tree is always available and never goes away.
11  * Configuration changes happen only while no I/O is in progress.
12  *
13  * Default sector size is one byte unless changed by the "disk" ctl.
14  */
15 
16 #include "u.h"
17 #include "../port/lib.h"
18 #include "mem.h"
19 #include "dat.h"
20 #include "fns.h"
21 #include "io.h"
22 #include "ureg.h"
23 #include "../port/error.h"
24 
/*
 * Driver-wide constants: device/ctl command types, size defaults,
 * qid layout, flag values for helper arguments, and tunable limits.
 */
enum
{
	/*
	 * Device types and ctl commands.  Fmirror..Fpart are stored in
	 * Fsdev.type; all but Fnone are also command indexes in configs[].
	 */
	Fnone,
	Fmirror,		/* mirror of others */
	Fcat,			/* catenation of others */
	Finter,			/* interleaving of others */
	Fpart,			/* part of other */
	Fclear,			/* start over: delete every device */
	Fdel,			/* delete a configured device */
	Fdisk,			/* set default tree and sector size */

	Sectorsz = 1,		/* default sector size, in bytes */
	Blksize	= 8*1024,	/* interleave unit; for Finter only */

	Incr = 5,		/* a tree's devs array grows by this many slots */

	/*
	 * All qids are decorated with the tree number:
	 * mkpath() keeps the tree number in bits 16-31 and the
	 * per-tree number below in bits 0-15.
	 * #k/fs is tree number 0, is automatically added and
	 * its first qid is for the ctl file. It never goes away.
	 */
	Qtop	= 0,		/* #k */
	Qdir,			/* directory (#k/fs) */
	Qctl,			/* ctl, only for #k/fs/ctl */
	Qfirst,			/* first qid assigned for device */

	/* values for the isread argument of io(), catio() and interio() */
	Iswrite = 0,
	Isread,

	/* values for the mustexist argument of gettree() and getdev() */
	Optional = 0,
	Mustexist,

	/* tunable parameters */
	Maxconf	= 4*1024,	/* max length for config */
	Ndevs	= 32,		/* max. inner devs per command */
	Ntrees	= 128,		/* max. number of trees */
	Maxretries = 3,		/* max. retries of i/o errors */
	Retrypause = 5000,	/* ms. to pause between retries */
};
64 
typedef struct Inner Inner;
typedef struct Fsdev Fsdev;
typedef struct Tree Tree;

/*
 * One underlying (inner) device used to build an Fsdev.
 */
struct Inner
{
	char	*iname;		/* inner device name */
	vlong	isize;		/* size of inner device; set by setdsize() */
	Chan	*idev;		/* inner device; opened by mconfig(), closed by mdeldev() */
};

/*
 * One configured device: a mirror, catenation, interleaving or
 * partition built out of its inner devices.
 */
struct Fsdev
{
	Ref;			/* one per Chan doing I/O */
	int	gone;		/* true if removed */
	int	vers;		/* qid version for this device */
	int	type;		/* Fnone, Fmirror, ... */
	char	*name;		/* name for this fsdev */
	Tree*	tree;		/* where the device is kept */
	vlong	size;		/* size in bytes; computed from the inner sizes by setdsize() */
	vlong	start;		/* start address (for Fpart) */
	uint	ndevs;		/* number of inner devices */
	Inner	*inner[Ndevs];	/* inner devices */
};

/*
 * A directory of devices, served as #k/<name>.
 */
struct Tree
{
	char	*name;		/* name for #k/<name> */
	Fsdev	**devs;		/* devices in dir.; slots may be nil */
	uint	ndevs;		/* number of devices (non-nil slots) */
	uint	nadevs;		/* number of slots in use in devs, nil ones included */
};
97 
/* debugging print; silent unless the debug flag below is set */
#define dprint if(debug)print

extern Dev fsdevtab;		/* forward */

static RWlock lck;		/* r: use devices; w: change config  */
static Tree fstree;		/* The main "fs" tree. Never goes away */
static Tree *trees[Ntrees];	/* internal representation of config */
static int ntrees;		/* number of leading slots of trees ever used (high-water mark) */
static int qidvers;		/* counter used to hand out Fsdev.vers values */
static char *disk;		/* default tree name used */
static char *source;		/* default inner device used */
static int sectorsz = Sectorsz;	/* default sector size */
static char confstr[Maxconf];	/* textual configuration */

static int debug;		/* non-zero enables dprint output */

/* required first line of the configuration file read by rdconf() */
static char cfgstr[] = "fsdev:\n";

static Qid tqid = {Qtop, 0, QTDIR};	/* qid of the #k directory */
static Qid cqid = {Qctl, 0, 0};		/* qid of #k/fs/ctl */

/* device type names, indexed by Fsdev.type; used by seprintdev() */
static char* tnames[] = {
	[Fmirror]	"mirror",
	[Fcat]		"cat",
	[Finter]	"inter",
	[Fpart]		"part",
};

/*
 * ctl commands recognised by parseconfig().
 * NOTE(review): the third column is presumably the field count
 * enforced by lookupcmd, with 0 meaning "not checked" -- confirm
 * against the Cmdtab definition.
 */
static Cmdtab configs[] = {
	Fmirror,"mirror",	0,
	Fcat,	"cat",		0,
	Finter,	"inter",	0,
	Fpart,	"part",		0,
	Fclear,	"clear",	1,
	Fdel,	"del",		2,
	Fdisk,	"disk",		0,
};

static char Egone[] = "device is gone";		/* file has been removed */
137 
138 static char*
139 seprintdev(char *s, char *e, Fsdev *mp)
140 {
141 	int i;
142 
143 	if(mp == nil)
144 		return seprint(s, e, "<null Fsdev>");
145 	if(mp->type < 0 || mp->type >= nelem(tnames) || tnames[mp->type] == nil)
146 		return seprint(s, e, "bad device type %d\n", mp->type);
147 
148 	s = strecpy(s, e, tnames[mp->type]);
149 	if(mp->tree != &fstree)
150 		s = seprint(s, e, " %s/%s", mp->tree->name, mp->name);
151 	else
152 		s = seprint(s, e, " %s", mp->name);
153 	for(i = 0; i < mp->ndevs; i++)
154 		s = seprint(s, e, " %s", mp->inner[i]->iname);
155 	switch(mp->type){
156 	case Fmirror:
157 	case Fcat:
158 	case Finter:
159 		s = strecpy(s, e, "\n");
160 		break;
161 	case Fpart:
162 		s = seprint(s, e, " %ulld %ulld\n", mp->start, mp->size);
163 		break;
164 	default:
165 		panic("#k: seprintdev bug");
166 	}
167 	return s;
168 }
169 
170 static vlong
171 mkpath(int tree, int devno)
172 {
173 	return (tree&0xFFFF)<<16 | devno&0xFFFF;
174 }
175 
/*
 * Extract the tree number (bits 16-31) from a qid path.
 */
static int
path2treeno(int q)
{
	int hi;

	hi = q >> 16;
	return hi & 0xFFFF;
}
181 
/*
 * Extract the per-tree number (bits 0-15) from a qid path.
 */
static int
path2devno(int q)
{
	int lo;

	lo = q & 0xFFFF;
	return lo;
}
187 
188 static Tree*
189 gettree(int i, int mustexist)
190 {
191 	dprint("gettree %d\n", i);
192 	if(i < 0)
193 		panic("#k: bug: bad tree index %d in gettree", i);
194 	if(i >= ntrees || trees[i] == nil)
195 		if(mustexist)
196 			error(Enonexist);
197 		else
198 			return nil;
199 	return trees[i];
200 }
201 
202 static Fsdev*
203 getdev(Tree *t, int i, int mustexist)
204 {
205 	dprint("getdev %d\n", i);
206 	if(i < 0)
207 		panic("#k: bug: bad dev index %d in getdev", i);
208 	if(i >= t->nadevs || t->devs[i] == nil)
209 		if(mustexist)
210 			error(Enonexist);
211 		else
212 			return nil;
213 	return t->devs[i];
214 }
215 
216 static Fsdev*
217 path2dev(int q)
218 {
219 	Tree	*t;
220 
221 	dprint("path2dev %ux\n", q);
222 	t = gettree(path2treeno(q), Mustexist);
223 	return getdev(t, path2devno(q) - Qfirst, Mustexist);
224 }
225 
226 static Tree*
227 treealloc(char *name)
228 {
229 	int	i;
230 	Tree	*t;
231 
232 	dprint("treealloc %s\n", name);
233 	for(i = 0; i < nelem(trees); i++)
234 		if(trees[i] == nil)
235 			break;
236 	if(i == nelem(trees))
237 		return nil;
238 	t = trees[i] = mallocz(sizeof(Tree), 1);
239 	if(t == nil)
240 		return nil;
241 	if(i == ntrees)
242 		ntrees++;
243 	kstrdup(&t->name, name);
244 	return t;
245 }
246 
247 static Tree*
248 lookuptree(char *name)
249 {
250 	int i;
251 
252 	dprint("lookuptree %s\n", name);
253 	for(i = 0; i < ntrees; i++)
254 		if(trees[i] != nil && strcmp(trees[i]->name, name) == 0)
255 			return trees[i];
256 	return nil;
257 }
258 
259 static Fsdev*
260 devalloc(Tree *t, char *name)
261 {
262 	int	i, ndevs;
263 	Fsdev	*mp, **devs;
264 
265 	dprint("devalloc %s %s\n", t->name, name);
266 	mp = mallocz(sizeof(Fsdev), 1);
267 	if(mp == nil)
268 		return nil;
269 	for(i = 0; i < t->nadevs; i++)
270 		if(t->devs[i] == nil)
271 			break;
272 	if(i >= t->nadevs){
273 		if(t->nadevs % Incr == 0){
274 			ndevs = t->nadevs + Incr;
275 			devs = realloc(t->devs, ndevs * sizeof(Fsdev*));
276 			if(devs == nil){
277 				free(mp);
278 				return nil;
279 			}
280 			t->devs = devs;
281 		}
282 		t->devs[t->nadevs] = nil;
283 		t->nadevs++;
284 	}
285 	kstrdup(&mp->name, name);
286 	mp->vers = ++qidvers;
287 	mp->tree = t;
288 	t->devs[i] = mp;
289 	t->ndevs++;
290 	return mp;
291 }
292 
293 static void
294 deltree(Tree *t)
295 {
296 	int i;
297 
298 	dprint("deltree %s\n", t->name);
299 	for(i = 0; i < ntrees; i++)
300 		if(trees[i] == t){
301 			if(i > 0){		/* "fs" never goes away */
302 				free(t->name);
303 				free(t->devs);
304 				free(t);
305 				trees[i] = nil;
306 			}
307 			return;
308 		}
309 	panic("#k: deltree: bug: tree not found");
310 }
311 
312 /*
313  * A device is gone and we know that all its users are gone.
314  * A tree is gone when all its devices are gone ("fs" is never gone).
315  * Must close devices outside locks, so we could nest our own devices.
316  */
317 static void
318 mdeldev(Fsdev *mp)
319 {
320 	int	i;
321 	Inner	*in;
322 	Tree	*t;
323 
324 	dprint("deldev %s gone %d ref %uld\n", mp->name, mp->gone, mp->ref);
325 
326 	mp->gone = 1;
327 	mp->vers = ++qidvers;
328 
329 	wlock(&lck);
330 	t = mp->tree;
331 	for(i = 0; i < t->nadevs; i++)
332 		if(t->devs[i] == mp){
333 			t->devs[i] = nil;
334 			t->ndevs--;
335 			if(t->ndevs == 0)
336 				deltree(t);
337 			break;
338 		}
339 	wunlock(&lck);
340 
341 	free(mp->name);
342 	for(i = 0; i < mp->ndevs; i++){
343 		in = mp->inner[i];
344 		if(in->idev != nil)
345 			cclose(in->idev);
346 		free(in->iname);
347 		free(in);
348 	}
349 	if(debug)
350 		memset(mp, 9, sizeof *mp);	/* poison */
351 	free(mp);
352 }
353 
354 /*
355  * Delete one or all devices in one or all trees.
356  */
357 static void
358 mdelctl(char *tname, char *dname)
359 {
360 	int i, alldevs, alltrees, some;
361 	Fsdev *mp;
362 	Tree *t;
363 
364 	dprint("delctl %s\n", dname);
365 	alldevs = strcmp(dname, "*") == 0;
366 	alltrees = strcmp(tname, "*") == 0;
367 	some = 0;
368 Again:
369 	wlock(&lck);
370 	for(i = 0; i < ntrees; i++){
371 		t = trees[i];
372 		if(t == nil)
373 			continue;
374 		if(alltrees == 0 && strcmp(t->name, tname) != 0)
375 			continue;
376 		for(i = 0; i < t->nadevs; i++){
377 			mp = t->devs[i];
378 			if(t->devs[i] == nil)
379 				continue;
380 			if(alldevs == 0 && strcmp(mp->name, dname) != 0)
381 				continue;
382 			/*
383 			 * Careful: must close outside locks and that
384 			 * may change the file tree we are looking at.
385 			 */
386 			some++;
387 			mp->gone = 1;
388 			if(mp->ref == 0){
389 				incref(mp);	/* keep it there */
390 				wunlock(&lck);
391 				mdeldev(mp);
392 				goto Again;	/* tree can change */
393 			}
394 		}
395 	}
396 	wunlock(&lck);
397 	if(some == 0 && alltrees == 0)
398 		error(Enonexist);
399 }
400 
401 static void
402 setdsize(Fsdev* mp, vlong *ilen)
403 {
404 	int	i;
405 	vlong	inlen;
406 	Inner	*in;
407 
408 	dprint("setdsize %s\n", mp->name);
409 	for (i = 0; i < mp->ndevs; i++){
410 		in = mp->inner[i];
411 		in->isize = ilen[i];
412 		inlen = in->isize;
413 		switch(mp->type){
414 		case Finter:
415 			/* truncate to multiple of Blksize */
416 			inlen &= ~(Blksize-1);
417 			in->isize = inlen;
418 			/* fall through */
419 		case Fmirror:
420 			/* use size of smallest inner device */
421 			if (mp->size == 0 || mp->size > inlen)
422 				mp->size = inlen;
423 			break;
424 		case Fcat:
425 			mp->size += inlen;
426 			break;
427 		case Fpart:
428 			if(mp->start > inlen)
429 				error("partition starts after device end");
430 			if(inlen < mp->start + mp->size){
431 				print("#k: %s: partition truncated from "
432 					"%lld to %lld bytes\n", mp->name,
433 					mp->size, inlen - mp->start);
434 				mp->size = inlen - mp->start;
435 			}
436 			break;
437 		}
438 	}
439 	if(mp->type == Finter)
440 		mp->size *= mp->ndevs;
441 }
442 
443 static void
444 validdevname(Tree *t, char *dname)
445 {
446 	int i;
447 
448 	for(i = 0; i < t->nadevs; i++)
449 		if(t->devs[i] != nil && strcmp(t->devs[i]->name, dname) == 0)
450 			error(Eexist);
451 }
452 
453 static void
454 parseconfig(char *a, long n, Cmdbuf **cbp, Cmdtab **ctp)
455 {
456 	Cmdbuf	*cb;
457 	Cmdtab	*ct;
458 
459 	*cbp = cb = parsecmd(a, n);
460 	*ctp = ct = lookupcmd(cb, configs, nelem(configs));
461 
462 	cb->f++;			/* skip command */
463 	cb->nf--;
464 	switch(ct->index){
465 	case Fmirror:
466 	case Fcat:
467 	case Finter:
468 		if(cb->nf < 2)
469 			error("too few arguments for ctl");
470 		if(cb->nf - 1 > Ndevs)
471 			error("too many devices in ctl");
472 		break;
473 	case Fdisk:
474 		if(cb->nf < 1 || cb->nf > 3)
475 			error("ctl usage: disk name [sz dev]");
476 		break;
477 	case Fpart:
478 		if(cb->nf != 4 && (cb->nf != 3 || source == nil))
479 			error("ctl usage: part new [file] off len");
480 		break;
481 	}
482 }
483 
484 static void
485 parsename(char *name, char *disk, char **tree, char **dev)
486 {
487 	char *slash;
488 
489 	slash = strchr(name, '/');
490 	if(slash == nil){
491 		if(disk != nil)
492 			*tree = disk;
493 		else
494 			*tree = "fs";
495 		*dev = name;
496 	}else{
497 		*tree = name;
498 		*slash++ = 0;
499 		*dev = slash;
500 	}
501 	validname(*tree, 0);
502 	validname(*dev, 0);
503 }
504 
505 static vlong
506 getlen(Chan *c)
507 {
508 	uchar	buf[128];	/* old DIRLEN plus a little should be plenty */
509 	Dir	d;
510 	long	l;
511 
512 	l = devtab[c->type]->stat(c, buf, sizeof buf);
513 	convM2D(buf, l, &d, nil);
514 	return d.length;
515 }
516 
517 /*
518  * Process a single line of configuration,
519  * often of the form "cmd newname idev0 idev1".
520  * locking is tricky, because we need a write lock to
521  * add/remove devices yet adding/removing them may lead
522  * to calls to this driver that require a read lock (when
523  * inner devices are also provided by us).
524  */
525 static void
526 mconfig(char* a, long n)
527 {
528 	int	i;
529 	vlong	size, start;
530 	vlong	*ilen;
531 	char	*tname, *dname, *fakef[4];
532 	Chan	**idev;
533 	Cmdbuf	*cb;
534 	Cmdtab	*ct;
535 	Fsdev	*mp;
536 	Inner	*inprv;
537 	Tree	*t;
538 
539 	/* ignore comments & empty lines */
540 	if (*a == '\0' || *a == '#' || *a == '\n')
541 		return;
542 
543 	dprint("mconfig\n");
544 	size = 0;
545 	start = 0;
546 	mp = nil;
547 	cb = nil;
548 	idev = nil;
549 	ilen = nil;
550 
551 	if(waserror()){
552 		free(cb);
553 		nexterror();
554 	}
555 
556 	parseconfig(a, n, &cb, &ct);
557 	switch (ct->index) {
558 	case Fdisk:
559 		kstrdup(&disk, cb->f[0]);
560 		if(cb->nf >= 2)
561 			sectorsz = strtoul(cb->f[1], 0, 0);
562 		else
563 			sectorsz = Sectorsz;
564 		if(cb->nf == 3)
565 			kstrdup(&source, cb->f[2]);
566 		else{
567 			free(source);
568 			source = nil;
569 		}
570 		poperror();
571 		free(cb);
572 		return;
573 	case Fclear:
574 		poperror();
575 		free(cb);
576 		mdelctl("*", "*");		/* del everything */
577 		return;
578 	case Fpart:
579 		if(cb->nf == 3){
580 			/*
581 			 * got a request in the format of sd(3),
582 			 * pretend we got one in our format.
583 			 * later we change end to be len.
584 			 */
585 			fakef[0] = cb->f[0];
586 			fakef[1] = source;
587 			fakef[2] = cb->f[1];
588 			fakef[3] = cb->f[2];
589 			cb->f = fakef;
590 			cb->nf = 4;
591 		}
592 		start = strtoll(cb->f[2], nil, 10);
593 		size =  strtoll(cb->f[3], nil, 10);
594 		if(cb->f == fakef)
595 			size -= start;		/* it was end */
596 		cb->nf -= 2;
597 		break;
598 	}
599 	parsename(cb->f[0], disk, &tname, &dname);
600 	for(i = 1; i < cb->nf; i++)
601 		validname(cb->f[i], 1);
602 
603 	if(ct->index == Fdel){
604 		mdelctl(tname, dname);
605 		poperror();
606 		free(cb);
607 		return;
608 	}
609 
610 	/*
611 	 * Open all inner devices while we have only a read lock.
612 	 */
613 	poperror();
614 	rlock(&lck);
615 	if(waserror()){
616 		runlock(&lck);
617 Fail:
618 		for(i = 1; i < cb->nf; i++)
619 			if(idev != nil && idev[i-1] != nil)
620 				cclose(idev[i]);
621 		if(mp != nil)
622 			mdeldev(mp);
623 		free(idev);
624 		free(ilen);
625 		free(cb);
626 		nexterror();
627 	}
628 	idev = smalloc(sizeof(Chan*) * Ndevs);
629 	ilen = smalloc(sizeof(vlong) * Ndevs);
630 	for(i = 1; i < cb->nf; i++){
631 		idev[i-1] = namec(cb->f[i], Aopen, ORDWR, 0);
632 		ilen[i-1] = getlen(idev[i-1]);
633 	}
634 	poperror();
635 	runlock(&lck);
636 
637 	/*
638 	 * Get a write lock and add the device if we can.
639 	 */
640 	wlock(&lck);
641 	if(waserror()){
642 		wunlock(&lck);
643 		goto Fail;
644 	}
645 
646 	t = lookuptree(tname);
647 	if(t != nil)
648 		validdevname(t, dname);
649 	else
650 		t = treealloc(tname);
651 	if(t == nil)
652 		error("no more trees");
653 	mp = devalloc(t, dname);
654 	if(mp == nil){
655 		if(t->ndevs == 0)	/* it was created for us */
656 			deltree(t);	/* but we will not mdeldev() */
657 		error(Enomem);
658 	}
659 
660 	mp->type = ct->index;
661 	if(mp->type == Fpart){
662 		mp->start = start * sectorsz;
663 		mp->size = size * sectorsz;
664 	}
665 	for(i = 1; i < cb->nf; i++){
666 		inprv = mp->inner[i-1] = mallocz(sizeof(Inner), 1);
667 		if(inprv == nil)
668 			error(Enomem);
669 		mp->ndevs++;
670 		kstrdup(&inprv->iname, cb->f[i]);
671 		inprv->idev = idev[i-1];
672 		idev[i-1] = nil;
673 	}
674 	setdsize(mp, ilen);
675 
676 	poperror();
677 	wunlock(&lck);
678 	free(idev);
679 	free(ilen);
680 	free(cb);
681 }
682 
683 static void
684 rdconf(void)
685 {
686 	int mustrd;
687 	char *c, *e, *p, *s;
688 	Chan *cc;
689 	static int configed;
690 
691 	/* only read config file once */
692 	if (configed)
693 		return;
694 	configed = 1;
695 
696 	dprint("rdconf\n");
697 	/* add the std "fs" tree */
698 	trees[0] = &fstree;
699 	ntrees++;
700 	fstree.name = "fs";
701 
702 	/* identify the config file */
703 	s = getconf("fsconfig");
704 	if (s == nil){
705 		mustrd = 0;
706 		s = "/dev/sdC0/fscfg";
707 	} else
708 		mustrd = 1;
709 
710 	/* read it */
711 	cc = nil;
712 	c = nil;
713 	if (waserror()){
714 		if (cc != nil)
715 			cclose(cc);
716 		if (c)
717 			free(c);
718 		if (!mustrd)
719 			return;
720 		nexterror();
721 	}
722 	cc = namec(s, Aopen, OREAD, 0);
723 	devtab[cc->type]->read(cc, confstr, sizeof confstr, 0);
724 	cclose(cc);
725 	cc = nil;
726 
727 	/* validate, copy and erase config; mconfig will repopulate confstr */
728 	if (strncmp(confstr, cfgstr, sizeof cfgstr - 1) != 0)
729 		error("bad #k config, first line must be: 'fsdev:\\n'");
730 	kstrdup(&c, confstr + sizeof cfgstr - 1);
731 	memset(confstr, 0, sizeof confstr);
732 
733 	/* process config copy one line at a time */
734 	for (p = c; p != nil && *p != '\0'; p = e){
735 		e = strchr(p, '\n');
736 		if (e == nil)
737 			e = p + strlen(p);
738 		else
739 			e++;
740 		mconfig(p, e - p);
741 	}
742 	USED(cc);		/* until now, can be used in waserror clause */
743 	poperror();
744 }
745 
746 static int
747 mgen(Chan *c, char*, Dirtab*, int, int i, Dir *dp)
748 {
749 	int	treeno;
750 	Fsdev	*mp;
751 	Qid	qid;
752 	Tree	*t;
753 
754 	dprint("mgen %#ullx %d\n", c->qid.path, i);
755 	qid.type = QTDIR;
756 	qid.vers = 0;
757 	if(c->qid.path == Qtop){
758 		if(i == DEVDOTDOT){
759 			devdir(c, tqid, "#k", 0, eve, DMDIR|0775, dp);
760 			return 1;
761 		}
762 		t = gettree(i, Optional);
763 		if(t == nil){
764 			dprint("no\n");
765 			return -1;
766 		}
767 		qid.path = mkpath(i, Qdir);
768 		devdir(c, qid, t->name, 0, eve, DMDIR|0775, dp);
769 		return 1;
770 	}
771 
772 	treeno = path2treeno(c->qid.path);
773 	t = gettree(treeno, Optional);
774 	if(t == nil){
775 		dprint("no\n");
776 		return -1;
777 	}
778 	if((c->qid.type & QTDIR) != 0){
779 		if(i == DEVDOTDOT){
780 			devdir(c, tqid, "#k", 0, eve, DMDIR|0775, dp);
781 			return 1;
782 		}
783 		if(treeno == 0){
784 			/* take care of #k/fs/ctl */
785 			if(i == 0){
786 				devdir(c, cqid, "ctl", 0, eve, 0664, dp);
787 				return 1;
788 			}
789 			i--;
790 		}
791 		mp = getdev(t, i, Optional);
792 		if(mp == nil){
793 			dprint("no\n");
794 			return -1;
795 		}
796 		qid.type = QTFILE;
797 		qid.vers = mp->vers;
798 		qid.path = mkpath(treeno, Qfirst+i);
799 		devdir(c, qid, mp->name, mp->size, eve, 0664, dp);
800 		return 1;
801 	}
802 
803 	if(i == DEVDOTDOT){
804 		qid.path = mkpath(treeno, Qdir);
805 		devdir(c, qid, t->name, 0, eve, DMDIR|0775, dp);
806 		return 1;
807 	}
808 	dprint("no\n");
809 	return -1;
810 }
811 
812 static Chan*
813 mattach(char *spec)
814 {
815 	dprint("mattach\n");
816 	return devattach(fsdevtab.dc, spec);
817 }
818 
819 static Walkqid*
820 mwalk(Chan *c, Chan *nc, char **name, int nname)
821 {
822 	Walkqid *wq;
823 
824 	rdconf();
825 
826 	dprint("mwalk %llux\n", c->qid.path);
827 	rlock(&lck);
828 	if(waserror()){
829 		runlock(&lck);
830 		nexterror();
831 	}
832 	wq = devwalk(c, nc, name, nname, 0, 0, mgen);
833 	poperror();
834 	runlock(&lck);
835 	return wq;
836 }
837 
838 static int
839 mstat(Chan *c, uchar *db, int n)
840 {
841 	int	p;
842 	Dir	d;
843 	Fsdev	*mp;
844 	Qid	q;
845 	Tree	*t;
846 
847 	dprint("mstat %llux\n", c->qid.path);
848 	rlock(&lck);
849 	if(waserror()){
850 		runlock(&lck);
851 		nexterror();
852 	}
853 	p = c->qid.path;
854 	memset(&d, 0, sizeof d);
855 	switch(p){
856 	case Qtop:
857 		devdir(c, tqid, "#k", 0, eve, DMDIR|0775, &d);
858 		break;
859 	case Qctl:
860 		devdir(c, cqid, "ctl", 0, eve, 0664, &d);
861 		break;
862 	default:
863 		t = gettree(path2treeno(p), Mustexist);
864 		if(c->qid.type & QTDIR)
865 			devdir(c, c->qid, t->name, 0, eve, DMDIR|0775, &d);
866 		else{
867 			mp = getdev(t, path2devno(p) - Qfirst, Mustexist);
868 			q = c->qid;
869 			q.vers = mp->vers;
870 			devdir(c, q, mp->name, mp->size, eve, 0664, &d);
871 		}
872 	}
873 	n = convD2M(&d, db, n);
874 	if (n == 0)
875 		error(Ebadarg);
876 	poperror();
877 	runlock(&lck);
878 	return n;
879 }
880 
881 static Chan*
882 mopen(Chan *c, int omode)
883 {
884 	int	q;
885 	Fsdev	*mp;
886 
887 	dprint("mopen %llux\n", c->qid.path);
888 	if((c->qid.type & QTDIR) && omode != OREAD)
889 		error(Eperm);
890 	if(c->qid.path != Qctl && (c->qid.type&QTDIR) == 0){
891 		rlock(&lck);
892 		if(waserror()){
893 			runlock(&lck);
894 			nexterror();
895 		}
896 		q = c->qid.path;
897 		mp = path2dev(q);
898 		if(mp->gone)
899 			error(Egone);
900 		incref(mp);
901 		poperror();
902 		runlock(&lck);
903 	}
904 	/*
905 	 * Our mgen does not return the info for the qid
906 	 * but only for its children. Don't use devopen here.
907 	 */
908 	c->offset = 0;
909 	c->mode = openmode(omode & ~OTRUNC);
910 	c->flag |= COPEN;
911 	return c;
912 }
913 
914 static void
915 mclose(Chan *c)
916 {
917 	int	mustdel, q;
918 	Fsdev	*mp;
919 
920 	dprint("mclose %llux\n", c->qid.path);
921 	if(c->qid.type & QTDIR || !(c->flag & COPEN))
922 		return;
923 	rlock(&lck);
924 	if(waserror()){
925 		runlock(&lck);
926 		nexterror();
927 	}
928 	mustdel = 0;
929 	mp = nil;
930 	q = c->qid.path;
931 	if(q == Qctl){
932 		free(disk);
933 		disk = nil;	/* restore defaults */
934 		free(source);
935 		source = nil;
936 		sectorsz = Sectorsz;
937 	}else{
938 		mp = path2dev(q);
939 		if(mp->gone != 0 && mp->ref == 1)
940 			mustdel = 1;
941 		else
942 			decref(mp);
943 	}
944 	poperror();
945 	runlock(&lck);
946 	if(mustdel)
947 		mdeldev(mp);
948 }
949 
950 static long
951 io(Fsdev *mp, Inner *in, int isread, void *a, long l, vlong off)
952 {
953 	long wl;
954 	Chan	*mc;
955 
956 	mc = in->idev;
957 	if(mc == nil)
958 		error(Egone);
959 	if (waserror()) {
960 		print("#k: %s: byte %,lld count %ld (of #k/%s): %s error: %s\n",
961 			in->iname, off, l, mp->name, (isread? "read": "write"),
962 			(up && up->errstr? up->errstr: ""));
963 		nexterror();
964 	}
965 	if (isread)
966 		wl = devtab[mc->type]->read(mc, a, l, off);
967 	else
968 		wl = devtab[mc->type]->write(mc, a, l, off);
969 	poperror();
970 	return wl;
971 }
972 
973 /* NB: a transfer could span multiple inner devices */
974 static long
975 catio(Fsdev *mp, int isread, void *a, long n, vlong off)
976 {
977 	int	i;
978 	long	l, res;
979 	Inner	*in;
980 
981 	if(debug)
982 		print("catio %d %p %ld %lld\n", isread, a, n, off);
983 	res = n;
984 	for (i = 0; n > 0 && i < mp->ndevs; i++){
985 		in = mp->inner[i];
986 		if (off >= in->isize){
987 			off -= in->isize;
988 			continue;		/* not there yet */
989 		}
990 		if (off + n > in->isize)
991 			l = in->isize - off;
992 		else
993 			l = n;
994 		if(debug)
995 			print("\tdev %d %p %ld %lld\n", i, a, l, off);
996 
997 		if (io(mp, in, isread, a, l, off) != l)
998 			error(Eio);
999 
1000 		a = (char*)a + l;
1001 		off = 0;
1002 		n -= l;
1003 	}
1004 	if(debug)
1005 		print("\tres %ld\n", res - n);
1006 	return res - n;
1007 }
1008 
1009 static long
1010 interio(Fsdev *mp, int isread, void *a, long n, vlong off)
1011 {
1012 	int	i;
1013 	long	boff, res, l, wl, wsz;
1014 	vlong	woff, blk, mblk;
1015 
1016 	blk  = off / Blksize;
1017 	boff = off % Blksize;
1018 	wsz  = Blksize - boff;
1019 	res = n;
1020 	while(n > 0){
1021 		mblk = blk / mp->ndevs;
1022 		i    = blk % mp->ndevs;
1023 		woff = mblk*Blksize + boff;
1024 		if (n > wsz)
1025 			l = wsz;
1026 		else
1027 			l = n;
1028 
1029 		wl = io(mp, mp->inner[i], isread, a, l, woff);
1030 		if (wl != l)
1031 			error(Eio);
1032 
1033 		blk++;
1034 		boff = 0;
1035 		wsz = Blksize;
1036 		a = (char*)a + l;
1037 		n -= l;
1038 	}
1039 	return res;
1040 }
1041 
1042 static char*
1043 seprintconf(char *s, char *e)
1044 {
1045 	int	i, j;
1046 	Tree	*t;
1047 
1048 	*s = 0;
1049 	for(i = 0; i < ntrees; i++){
1050 		t = trees[i];
1051 		if(t != nil)
1052 			for(j = 0; j < t->nadevs; j++)
1053 				if(t->devs[j] != nil)
1054 					s = seprintdev(s, e, t->devs[j]);
1055 	}
1056 	return s;
1057 }
1058 
/*
 * Read handler; runs under the read lock, which is released on
 * error as well.
 *	directories	entries generated by mgen;
 *	ctl		the textual configuration, rebuilt into confstr;
 *	devices		up to n bytes at offset off, clipped to the
 *			size of the device (0 at or past its end),
 *			dispatched on the device type.
 * For mirrors the inner devices are tried in order until one of
 * them reads without error; if none does, the whole round is
 * retried up to Maxretries times, pausing Retrypause ms. before
 * each retry, and Eio is raised when all retries fail.
 * Returns the number of bytes read.
 */
static long
mread(Chan *c, void *a, long n, vlong off)
{
	int	i, retry;
	long	l, res;
	Fsdev	*mp;
	Tree	*t;

	dprint("mread %llux\n", c->qid.path);
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}
	res = -1;
	if(c->qid.type & QTDIR){
		res = devdirread(c, a, n, 0, 0, mgen);
		goto Done;
	}
	if(c->qid.path == Qctl){
		seprintconf(confstr, confstr + sizeof(confstr));
		res = readstr((long)off, a, n, confstr);
		goto Done;
	}

	t = gettree(path2treeno(c->qid.path), Mustexist);
	mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);

	/* clip the request to the size of the device */
	if(off >= mp->size){
		res = 0;
		goto Done;
	}
	if(off + n > mp->size)
		n = mp->size - off;
	if(n == 0){
		res = 0;
		goto Done;
	}

	switch(mp->type){
	case Fcat:
		res = catio(mp, Isread, a, n, off);
		break;
	case Finter:
		res = interio(mp, Isread, a, n, off);
		break;
	case Fpart:
		/* a partition has a single inner device */
		res = io(mp, mp->inner[0], Isread, a, n, mp->start + off);
		break;
	case Fmirror:
		retry = 0;
		do {
			if (retry > 0) {
				print("#k/%s: retry %d read for byte %,lld "
					"count %ld: %s\n", mp->name, retry, off,
					n, (up && up->errstr? up->errstr: ""));
				/*
				 * pause before retrying in case it's due to
				 * a transient bus or controller problem.
				 */
				tsleep(&up->sleep, return0, 0, Retrypause);
			}
			for (i = 0; i < mp->ndevs; i++){
				/* an error from io() lands here: try the next copy */
				if (waserror())
					continue;
				l = io(mp, mp->inner[i], Isread, a, n, off);
				poperror();
				if (l >= 0){
					res = l;
					break;		/* read a good copy */
				}
			}
			/* i == mp->ndevs means that no inner device could be read */
		} while (i == mp->ndevs && ++retry <= Maxretries);
		if (retry > Maxretries) {
			/* no mirror had a good copy of the block */
			print("#k/%s: byte %,lld count %ld: CAN'T READ "
				"from mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
			error(Eio);
		} else if (retry > 0)
			print("#k/%s: byte %,lld count %ld: retry read OK "
				"from mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
		break;
	}
Done:
	poperror();
	runlock(&lck);
	return res;
}
1149 
/*
 * Write handler.
 *	directories	raise Eisdir;
 *	ctl		the data is a configuration request, handed
 *			to mconfig before any lock is taken here;
 *	devices		up to n bytes at offset off, clipped to the
 *			size of the device (0 at or past its end),
 *			dispatched on the device type under the
 *			read lock.
 * For mirrors the data is written to every inner device, last
 * one first.  While any copy fails the whole round is retried,
 * up to Maxretries times, pausing Retrypause ms. before each
 * retry; Eio is raised only if no copy at all was written in the
 * last round.
 * Returns the number of bytes written.
 */
static long
mwrite(Chan *c, void *a, long n, vlong off)
{
	int	i, allbad, anybad, retry;
	long	l, res;
	Fsdev	*mp;
	Tree	*t;

	dprint("mwrite %llux\n", c->qid.path);
	if (c->qid.type & QTDIR)
		error(Eisdir);
	if (c->qid.path == Qctl){
		mconfig(a, n);
		return n;
	}

	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}

	t = gettree(path2treeno(c->qid.path), Mustexist);
	mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);

	/* clip the request to the size of the device */
	if(off >= mp->size){
		res = 0;
		goto Done;
	}
	if(off + n > mp->size)
		n = mp->size - off;
	if(n == 0){
		res = 0;
		goto Done;
	}
	res = n;
	switch(mp->type){
	case Fcat:
		res = catio(mp, Iswrite, a, n, off);
		break;
	case Finter:
		res = interio(mp, Iswrite, a, n, off);
		break;
	case Fpart:
		/* a partition has a single inner device */
		res = io(mp, mp->inner[0], Iswrite, a, n, mp->start + off);
		if (res != n)
			error(Eio);
		break;
	case Fmirror:
		retry = 0;
		do {
			if (retry > 0) {
				print("#k/%s: retry %d write for byte %,lld "
					"count %ld: %s\n", mp->name, retry, off,
					n, (up && up->errstr? up->errstr: ""));
				/*
				 * pause before retrying in case it's due to
				 * a transient bus or controller problem.
				 */
				tsleep(&up->sleep, return0, 0, Retrypause);
			}
			allbad = 1;
			anybad = 0;
			for (i = mp->ndevs - 1; i >= 0; i--){
				/* an error from io() lands here: go on with the next copy */
				if (waserror()) {
					anybad = 1;
					continue;
				}
				l = io(mp, mp->inner[i], Iswrite, a, n, off);
				poperror();
				if (l == n)
					allbad = 0;	/* wrote a good copy */
				else
					anybad = 1;
			}
		} while (anybad && ++retry <= Maxretries);
		/* allbad and anybad describe the last round only */
		if (allbad) {
			/* no mirror took a good copy of the block */
			print("#k/%s: byte %,lld count %ld: CAN'T WRITE "
				"to mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
			error(Eio);
		} else if (retry > 0)
			print("#k/%s: byte %,lld count %ld: retry wrote OK "
				"to mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));

		break;
	}
Done:
	poperror();
	runlock(&lck);
	return res;
}
1244 
/*
 * Device table entry for this driver.
 * The m* entries are the handlers defined in this file; the dev*
 * entries are not defined here.
 * NOTE(review): the dev* entries are presumably the kernel's
 * generic defaults, and the entries follow the field order of
 * Dev -- confirm against its declaration.
 */
Dev fsdevtab = {
	'k',			/* device character: the driver is #k */
	"devfs",

	devreset,
	devinit,
	devshutdown,
	mattach,
	mwalk,
	mstat,
	mopen,
	devcreate,
	mclose,
	mread,
	devbread,
	mwrite,
	devbwrite,
	devremove,
	devwstat,
	devpower,
	devconfig,
};
1267