/*
 * virtio 1.0 disk driver
 * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
 *
 * In contrast to sdvirtio.c, this driver handles the non-legacy
 * interface for virtio disks, which uses mmio for all register accesses
 * and requires an elaborate pci capability structure dance to get working.
 *
 * It is somewhat pointless, as mmio is most likely slower than
 * port i/o (it is harder to emulate on the pc platform).
 *
 * The reason this driver is needed is that vultr sets the
 * disable-legacy=on option in the -device parameter for qemu
 * on their hypervisor.
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "ureg.h"
#include "../port/error.h"

#include "../port/sd.h"

typedef struct Vscsidev Vscsidev;
typedef struct Vblkdev Vblkdev;

typedef struct Vconfig Vconfig;
typedef struct Vring Vring;
typedef struct Vdesc Vdesc;
typedef struct Vused Vused;
typedef struct Vqueue Vqueue;
typedef struct Vdev Vdev;


/* device types */
enum {
	TypBlk	= 2,
	TypSCSI	= 8,
};

/* status flags */
enum {
	Acknowledge = 1,
	Driver = 2,
	FeaturesOk = 8,
	DriverOk = 4,
	Failed = 0x80,
};

/* descriptor flags */
enum {
	Next = 1,
	Write = 2,
	Indirect = 4,
};

/* struct sizes */
enum {
	VringSize = 4,
};

enum {
	CDBSIZE		= 32,
	SENSESIZE	= 96,
};


struct Vscsidev
{
	u32int	num_queues;
	u32int	seg_max;
	u32int	max_sectors;
	u32int	cmd_per_lun;
	u32int	event_info_size;
	u32int	sense_size;
	u32int	cdb_size;
	u16int	max_channel;
	u16int	max_target;
	u32int	max_lun;
};

struct Vblkdev
{
	u64int	capacity;
};

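/* virtio 1.0 common configuration registers, located via pci capability type 1 */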
struct Vconfig {
	u32int	devfeatsel;
	u32int	devfeat;
	u32int	drvfeatsel;
	u32int	drvfeat;

	u16int	msixcfg;
	u16int	nqueues;

	u8int	status;
	u8int	cfggen;
	u16int	queuesel;

	u16int	queuesize;
	u16int	queuemsixvect;

	u16int	queueenable;
	u16int	queuenotifyoff;

	u64int	queuedesc;
	u64int	queueavail;
	u64int	queueused;
};

struct Vring
{
	u16int	flags;
	u16int	idx;
};

struct Vdesc
{
	u64int	addr;
	u32int	len;
	u16int	flags;
	u16int	next;
};

struct Vused
{
	u32int	id;
	u32int	len;
};

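/* driver state for one virtqueue; rock[] maps a descriptor index to the Rock of the process waiting on it */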
struct Vqueue
{
	Lock;

	Vdev	*dev;
	void	*notify;
	int	idx;

	int	size;

	int	free;
	int	nfree;

	Vdesc	*desc;

	Vring	*avail;
	u16int	*availent;
	u16int	*availevent;

	Vring	*used;
	Vused	*usedent;
	u16int	*usedevent;
	u16int	lastused;

	void	*rock[];
};

struct Vdev
{
	int	typ;

	Pcidev	*pci;

	uvlong	port;
	ulong	feat[2];

	int	nqueue;
	Vqueue	*queue[16];

	void	*dev;	/* device specific config (for scsi) */

	/* registers */
	Vconfig	*cfg;
	u8int	*isr;
	u8int	*notify;
	u32int	notifyoffmult;

	Vdev	*next;
};

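/*
 * allocate a virtqueue of the given size: the descriptor table,
 * available ring and used ring are carved out of page-aligned
 * memory and all descriptors are chained onto the free list.
 */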
static Vqueue*
mkvqueue(int size)
{
	Vqueue *q;
	uchar *p;
	int i;

	q = malloc(sizeof(*q) + sizeof(void*)*size);
	p = mallocalign(
		PGROUND(sizeof(Vdesc)*size +
			VringSize +
			sizeof(u16int)*size +
			sizeof(u16int)) +
		PGROUND(VringSize +
			sizeof(Vused)*size +
			sizeof(u16int)),
		BY2PG, 0, 0);
	if(p == nil || q == nil){
		print("virtio: no memory for Vqueue\n");
		free(p);
		free(q);
		return nil;
	}

	q->desc = (void*)p;
	p += sizeof(Vdesc)*size;
	q->avail = (void*)p;
	p += VringSize;
	q->availent = (void*)p;
	p += sizeof(u16int)*size;
	q->availevent = (void*)p;
	p += sizeof(u16int);

	p = (uchar*)PGROUND((uintptr)p);
	q->used = (void*)p;
	p += VringSize;
	q->usedent = (void*)p;
	p += sizeof(Vused)*size;
	q->usedevent = (void*)p;

	q->free = -1;
	q->nfree = q->size = size;
	for(i=0; i<size; i++){
		q->desc[i].next = q->free;
		q->free = i;
	}

	return q;
}

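/*
 * match a vendor specific pci capability (id 9) of the requested
 * virtio cfg type that points at a valid memory bar; used by
 * virtiocap() via pcienumcaps().
 */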
static int
matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
{
	int bar;

	if(cap != 9 || pcicfgr8(p, off+3) != typ)
		return 1;

	/* skip invalid or non memory bars */
	bar = pcicfgr8(p, off+4);
	if(bar < 0 || bar >= nelem(p->mem)
	|| p->mem[bar].size == 0
	|| (p->mem[bar].bar & 3) != 0)
		return 1;

	return 0;
}

static int
virtiocap(Pcidev *p, int typ)
{
	return pcienumcaps(p, matchvirtiocfgcap, typ);
}

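/*
 * map the register region described by a virtio pci capability:
 * read the bar index, offset and length from the capability and
 * vmap the corresponding part of the bar.
 */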
static void*
virtiomapregs(Pcidev *p, int cap, int size)
{
	int bar, len;
	uvlong addr;

	if(cap < 0)
		return nil;
	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
	addr = pcicfgr32(p, cap+8);
	len = pcicfgr32(p, cap+12);
	if(size <= 0)
		size = len;
	else if(len < size)
		return nil;
	if(addr+len > p->mem[bar].size)
		return nil;
	addr += p->mem[bar].bar & ~0xFULL;
	return vmap(addr, size);
}

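/*
 * enumerate non-transitional virtio pci devices (vendor 0x1AF4,
 * device 0x1040+typ): map the common config, isr and notify
 * regions, reset the device, accept the VERSION_1 feature bit
 * and set up the virtqueues.
 */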
static Vdev*
viopnpdevs(int typ)
{
	Vdev *vd, *h, *t;
	Vconfig *cfg;
	Vqueue *q;
	Pcidev *p;
	int cap, bar;
	int n, i;

	h = t = nil;
	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
		if(p->rid == 0)
			continue;
		if((cap = virtiocap(p, 1)) < 0)
			continue;
		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
		if(cfg == nil)
			continue;
		if((vd = malloc(sizeof(*vd))) == nil){
			print("virtio: no memory for Vdev\n");
			break;
		}
		vd->port = p->mem[bar].bar & ~0xFULL;
		vd->typ = typ;
		vd->pci = p;
		vd->cfg = cfg;

		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
		if(vd->isr == nil){
Baddev:
			/* TODO: vunmap */
			free(vd);
			continue;
		}
		cap = virtiocap(p, 2);
		vd->notify = virtiomapregs(p, cap, 0);
		if(vd->notify == nil)
			goto Baddev;
		vd->notifyoffmult = pcicfgr32(p, cap+16);

		/* reset */
		cfg->status = 0;
		while(cfg->status != 0)
			delay(1);
		cfg->status = Acknowledge|Driver;

		/* negotiate feature bits */
		cfg->devfeatsel = 1;
		vd->feat[1] = cfg->devfeat;
		cfg->devfeatsel = 0;
		vd->feat[0] = cfg->devfeat;
		cfg->drvfeatsel = 1;
		cfg->drvfeat = vd->feat[1] & 1;
		cfg->drvfeatsel = 0;
		cfg->drvfeat = 0;
		cfg->status |= FeaturesOk;

		for(i=0; i<nelem(vd->queue); i++){
			cfg->queuesel = i;
			n = cfg->queuesize;
			if(n == 0 || (n & (n-1)) != 0)
				break;
			if((q = mkvqueue(n)) == nil)
				break;
			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
			q->dev = vd;
			q->idx = i;
			vd->queue[i] = q;
			coherence();
			cfg->queuedesc = PADDR(q->desc);
			cfg->queueavail = PADDR(q->avail);
			cfg->queueused = PADDR(q->used);
		}
		vd->nqueue = i;

		if(h == nil)
			h = vd;
		else
			t->next = vd;
		t = vd;
	}

	return h;
}

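/* completion token linking a request to the process sleeping on it */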
struct Rock {
	int done;
	Rendez *sleep;
};

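/*
 * process the used ring: wake up the process that posted each
 * completed descriptor chain and return the chain to the free list.
 */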
static void
vqinterrupt(Vqueue *q)
{
	int id, free, m;
	struct Rock *r;
	Rendez *z;

	m = q->size-1;

	ilock(q);
	while((q->lastused ^ q->used->idx) & m){
		id = q->usedent[q->lastused++ & m].id;
		if(r = q->rock[id]){
			q->rock[id] = nil;
			z = r->sleep;
			r->done = 1;	/* hands off */
			if(z != nil)
				wakeup(z);
		}
		do {
			free = id;
			id = q->desc[free].next;
			q->desc[free].next = q->free;
			q->free = free;
			q->nfree++;
		} while(q->desc[free].flags & Next);
	}
	iunlock(q);
}

static void
viointerrupt(Ureg *, void *arg)
{
	Vdev *vd = arg;

	if(vd->isr[0] & 1)
		vqinterrupt(vd->queue[vd->typ == TypSCSI ? 2 : 0]);
}

static int
viodone(void *arg)
{
	return ((struct Rock*)arg)->done;
}

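/*
 * post the descriptor chain starting at head on the available ring,
 * notify the device unless notifications are suppressed, then sleep
 * until the chain shows up on the used ring. called with q ilocked;
 * unlocks q.
 */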
static void
vqio(Vqueue *q, int head)
{
	struct Rock rock;

	rock.done = 0;
	rock.sleep = &up->sleep;
	q->rock[head] = &rock;
	q->availent[q->avail->idx & (q->size-1)] = head;
	coherence();
	q->avail->idx++;
	iunlock(q);
	if((q->used->flags & 1) == 0)
		*((u16int*)q->notify) = q->idx;
	while(!rock.done){
		while(waserror())
			;
		tsleep(rock.sleep, viodone, &rock, 1000);
		poperror();

		if(!rock.done)
			vqinterrupt(q);
	}
}

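/*
 * issue a virtio-blk request: a chain of two or three descriptors
 * (request header, optional data buffer, status byte). returns the
 * status byte written back by the device (0 means ok).
 */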
static int
vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
{
	int need, free, head;
	Vqueue *q;
	Vdesc *d;

	u8int status;
	struct Vioblkreqhdr {
		u32int	typ;
		u32int	prio;
		u64int	lba;
	} req;

	need = 2;
	if(a != nil)
		need = 3;

	status = -1;
	req.typ = typ;
	req.prio = 0;
	req.lba = lba;

	q = vd->queue[0];
	ilock(q);
	while(q->nfree < need){
		iunlock(q);

		if(!waserror())
			tsleep(&up->sleep, return0, 0, 500);
		poperror();

		ilock(q);
	}

	head = free = q->free;

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(&req);
	d->len = sizeof(req);
	d->flags = Next;

	if(a != nil){
		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(a);
		d->len = secsize*count;
		d->flags = typ ? Next : (Write|Next);
	}

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(&status);
	d->len = sizeof(status);
	d->flags = Write;

	q->free = free;
	q->nfree -= need;

	/* queue io, unlock and wait for completion */
	vqio(q, head);

	return status;
}

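/*
 * issue a virtio-scsi command on queue 2 (the first request queue):
 * a request descriptor carrying the cdb, an optional data-out buffer,
 * a response descriptor with sense data and an optional data-in
 * buffer. fills in r->status, r->rlen and the sense data.
 */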
static int
vioscsireq(SDreq *r)
{
	u8int resp[4+4+2+2+SENSESIZE];
	u8int req[8+8+3+CDBSIZE];
	int free, head;
	u32int len;
	Vqueue *q;
	Vdesc *d;
	Vdev *vd;
	SDunit *u;
	Vscsidev *scsi;

	u = r->unit;
	vd = u->dev->ctlr;
	scsi = vd->dev;

	memset(resp, 0, sizeof(resp));
	memset(req, 0, sizeof(req));
	req[0] = 1;
	req[1] = u->subno;
	req[2] = r->lun>>8;
	req[3] = r->lun&0xFF;
	*(u64int*)(&req[8]) = (uintptr)r;

	memmove(&req[8+8+3], r->cmd, r->clen);

	q = vd->queue[2];
	ilock(q);
	while(q->nfree < 3){
		iunlock(q);

		if(!waserror())
			tsleep(&up->sleep, return0, 0, 500);
		poperror();

		ilock(q);
	}

	head = free = q->free;

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(req);
	d->len = 8+8+3+scsi->cdb_size;
	d->flags = Next;

	if(r->write && r->dlen > 0){
		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(r->data);
		d->len = r->dlen;
		d->flags = Next;
	}

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(resp);
	d->len = 4+4+2+2+scsi->sense_size;
	d->flags = Write;

	if(!r->write && r->dlen > 0){
		d->flags |= Next;

		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(r->data);
		d->len = r->dlen;
		d->flags = Write;
	}

	q->free = free;
	q->nfree -= 2 + (r->dlen > 0);

	/* queue io, unlock and wait for completion */
	vqio(q, head);

	/* response+status */
	r->status = resp[10];
	if(resp[11] != 0)
		r->status = SDcheck;

	/* sense_len */
	len = *((u32int*)&resp[0]);
	if(len > 0){
		if(len > sizeof(r->sense))
			len = sizeof(r->sense);
		memmove(r->sense, &resp[4+4+2+2], len);
		r->flags |= SDvalidsense;
	}

	/* data residue */
	len = *((u32int*)&resp[4]);
	if(len > r->dlen)
		r->rlen = 0;
	else
		r->rlen = r->dlen - len;

	return r->status;
}

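/*
 * block i/o entry point: scsi devices go through scsibio();
 * for virtio-blk, split the transfer into requests of at most
 * 32 sectors each.
 */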
static long
viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
{
	long ss, cc, max, ret;
	Vdev *vd;

	vd = u->dev->ctlr;
	if(vd->typ == TypSCSI)
		return scsibio(u, lun, write, a, count, lba);

	max = 32;
	ss = u->secsize;
	ret = 0;
	while(count > 0){
		if((cc = count) > max)
			cc = max;
		if(vioblkreq(vd, write != 0, (uchar*)a + ret, cc, ss, lba) != 0)
			error(Eio);
		ret += cc*ss;
		count -= cc;
		lba += cc;
	}
	return ret;
}

enum {
	SDread,
	SDwrite,
};

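/*
 * raw request entry point: scsi requests are passed through to
 * vioscsireq(); for virtio-blk, synchronize cache commands (0x35,
 * 0x91) become a flush request (type 4) and everything else is
 * emulated with sdfakescsi()/sdfakescsirw() on top of viobio().
 */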
static int
viorio(SDreq *r)
{
	int i, count, rw;
	uvlong lba;
	SDunit *u;
	Vdev *vd;

	u = r->unit;
	vd = u->dev->ctlr;
	if(vd->typ == TypSCSI)
		return vioscsireq(r);
	if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91){
		if(vioblkreq(vd, 4, nil, 0, 0, 0) != 0)
			return sdsetsense(r, SDcheck, 3, 0xc, 2);
		return sdsetsense(r, SDok, 0, 0, 0);
	}
	if((i = sdfakescsi(r, nil, 0)) != SDnostatus)
		return r->status = i;
	if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
		return i;
	r->rlen = viobio(u, r->lun, rw == SDwrite, r->data, count, lba);
	return r->status = SDok;
}

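/*
 * report unit geometry from the device configuration; returns 2
 * when the capacity changed, 1 when it is unchanged.
 */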
static int
vioonline(SDunit *u)
{
	Vdev *vd;
	Vblkdev *blk;
	uvlong cap;

	vd = u->dev->ctlr;
	if(vd->typ == TypSCSI)
		return scsionline(u);

	blk = vd->dev;
	cap = blk->capacity;
	if(u->sectors != cap){
		u->sectors = cap;
		u->secsize = 512;
		return 2;
	}
	return 1;
}

static int
vioverify(SDunit *u)
{
	Vdev *vd;

	vd = u->dev->ctlr;
	if(vd->typ == TypSCSI)
		return scsiverify(u);

	return 1;
}

SDifc sdvirtio10ifc;

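/*
 * enable the controller: set bus mastering, attach the interrupt
 * handler, enable all configured queues and announce DriverOk.
 */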
static int
vioenable(SDev *sd)
{
	char name[32];
	Vdev *vd;
	int i;

	vd = sd->ctlr;
	pcisetbme(vd->pci);
	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
	intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
	coherence();

	for(i = 0; i < vd->nqueue; i++){
		vd->cfg->queuesel = i;
		vd->cfg->queueenable = 1;
	}
	vd->cfg->status |= DriverOk;

	return 1;
}

static int
viodisable(SDev *sd)
{
	char name[32];
	Vdev *vd;

	vd = sd->ctlr;
	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
	intrdisable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
	pciclrbme(vd->pci);
	return 1;
}

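/*
 * discover virtio-blk and virtio-scsi controllers and build the
 * SDev list: block devices get ids starting at 'F', scsi
 * controllers at '0'; the device specific config region (pci
 * capability type 4) is mapped into vd->dev.
 */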
static SDev*
viopnp(void)
{
	SDev *s, *h, *t;
	Vdev *vd;
	int id;

	h = t = nil;

	id = 'F';
	for(vd = viopnpdevs(TypBlk); vd; vd = vd->next){
		if(vd->nqueue == 0)
			continue;

		if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
			break;
		if((s = malloc(sizeof(*s))) == nil)
			break;
		s->ctlr = vd;
		s->idno = id++;
		s->ifc = &sdvirtio10ifc;
		s->nunit = 1;
		if(h)
			t->next = s;
		else
			h = s;
		t = s;
	}

	id = '0';
	for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
		Vscsidev *scsi;

		if(vd->nqueue < 3)
			continue;

		if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
			break;
		if(scsi->max_target == 0){
			vunmap(scsi, sizeof(Vscsidev));
			continue;
		}
		if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){
			print("sdvirtio: cdb %ud or sense size %ud too big\n",
				scsi->cdb_size, scsi->sense_size);
			vunmap(scsi, sizeof(Vscsidev));
			continue;
		}
		vd->dev = scsi;

		if((s = malloc(sizeof(*s))) == nil)
			break;
		s->ctlr = vd;
		s->idno = id++;
		s->ifc = &sdvirtio10ifc;
		s->nunit = scsi->max_target;

		if(h)
			t->next = s;
		else
			h = s;
		t = s;
	}
	return h;
}

SDifc sdvirtio10ifc = {
	"virtio10",			/* name */

	viopnp,				/* pnp */
	nil,				/* legacy */
	vioenable,			/* enable */
	viodisable,			/* disable */

	vioverify,			/* verify */
	vioonline,			/* online */
	viorio,				/* rio */
	nil,				/* rctl */
	nil,				/* wctl */

	viobio,				/* bio */
	nil,				/* probe */
	nil,				/* clear */
	nil,				/* rtopctl */
	nil,				/* wtopctl */
};