1 /*
2 * virtio 1.0 disk driver
3 * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
4 *
5 * In contrast to sdvirtio.c, this driver handles the non-legacy
6 * interface for virtio disk which uses mmio for all register accesses
 * and requires an elaborate pci capability structure dance to get working.
8 *
9 * It is kind of pointless as it is most likely slower than
10 * port i/o (harder to emulate on the pc platform).
11 *
 * This driver is needed because vultr set the
13 * disable-legacy=on option in the -device parameter for qemu
14 * on their hypervisor.
15 */
16 #include "u.h"
17 #include "../port/lib.h"
18 #include "mem.h"
19 #include "dat.h"
20 #include "fns.h"
21 #include "io.h"
22 #include "ureg.h"
23 #include "../port/error.h"
24
25 #include "../port/sd.h"
26
27 typedef struct Vscsidev Vscsidev;
28 typedef struct Vblkdev Vblkdev;
29
30 typedef struct Vconfig Vconfig;
31 typedef struct Vring Vring;
32 typedef struct Vdesc Vdesc;
33 typedef struct Vused Vused;
34 typedef struct Vqueue Vqueue;
35 typedef struct Vdev Vdev;
36
37
/* device types (virtio 1.0 pci device id is 0x1040 + type) */
enum {
	TypBlk	= 2,
	TypSCSI	= 8,
};
43
/* status flags (device status register bits) */
enum {
	Acknowledge	= 1,	/* guest has noticed the device */
	Driver		= 2,	/* guest has a driver for it */
	FeaturesOk	= 8,	/* feature negotiation complete */
	DriverOk	= 4,	/* driver is fully operational */
	Failed		= 0x80,
};
52
/* descriptor flags */
enum {
	Next		= 1,	/* chained to the descriptor in the next field */
	Write		= 2,	/* device writes this buffer (else it reads it) */
	Indirect	= 4,
};
59
/* struct sizes */
enum {
	VringSize = 4,	/* sizeof(Vring): u16int flags + u16int idx */
};
64
/* fixed buffer sizes for the virtio-scsi command and sense data,
 * checked against the device's cdb_size/sense_size in viopnp() */
enum {
	CDBSIZE = 32,
	SENSESIZE = 96,
};
69
70
/* virtio-scsi device specific configuration (virtio_scsi_config),
 * mapped through the pci capability of cfg type 4 */
struct Vscsidev
{
	u32int	num_queues;
	u32int	seg_max;
	u32int	max_sectors;
	u32int	cmd_per_lun;
	u32int	event_info_size;
	u32int	sense_size;	/* sense buffer size the device expects */
	u32int	cdb_size;	/* cdb size the device expects */
	u16int	max_channel;
	u16int	max_target;	/* used as the number of units */
	u32int	max_lun;
};
84
/* virtio-blk device specific configuration (prefix of virtio_blk_config) */
struct Vblkdev
{
	u64int	capacity;	/* in 512-byte sectors */
};
89
/*
 * Common configuration structure (virtio_pci_common_cfg), memory
 * mapped through the pci capability of cfg type 1. The layout
 * mirrors the hardware register block and must not be changed.
 */
struct Vconfig {
	/* feature bit banks, selected by the *featsel registers */
	u32int	devfeatsel;
	u32int	devfeat;
	u32int	drvfeatsel;
	u32int	drvfeat;

	u16int	msixcfg;
	u16int	nqueues;

	u8int	status;		/* Acknowledge|Driver|... bits above */
	u8int	cfggen;		/* config generation counter */
	u16int	queuesel;	/* selects which queue the fields below address */

	u16int	queuesize;
	u16int	queuemsixvect;

	u16int	queueenable;
	u16int	queuenotifyoff;	/* scaled by Vdev.notifyoffmult for the doorbell */

	/* physical addresses of the three rings of the selected queue */
	u64int	queuedesc;
	u64int	queueavail;
	u64int	queueused;
};
113
/* common header of the avail and used rings */
struct Vring
{
	u16int	flags;
	u16int	idx;	/* free running producer index */
};
119
/* virtqueue buffer descriptor */
struct Vdesc
{
	u64int	addr;	/* physical address of the buffer */
	u32int	len;
	u16int	flags;	/* Next | Write | Indirect */
	u16int	next;	/* chain link, valid when Next is set */
};
127
/* used ring element: one completed descriptor chain */
struct Vused
{
	u32int	id;	/* head descriptor index of the chain */
	u32int	len;	/* number of bytes the device wrote */
};
133
/*
 * Driver state for one virtqueue. The embedded Lock protects the
 * free descriptor list and the avail ring.
 */
struct Vqueue
{
	Lock;

	Vdev	*dev;
	void	*notify;	/* mmio doorbell for this queue */
	int	idx;		/* queue index, written into the doorbell */

	int	size;		/* number of descriptors, a power of two */

	int	free;		/* head of the free descriptor list */
	int	nfree;		/* number of free descriptors */

	Vdesc	*desc;

	Vring	*avail;
	u16int	*availent;
	u16int	*availevent;

	Vring	*used;
	Vused	*usedent;
	u16int	*usedevent;
	u16int	lastused;	/* next used ring index to process */

	void	*rock[];	/* per head-descriptor wakeup handle (struct Rock*) */
};
160
/* driver state for one virtio pci device */
struct Vdev
{
	int	typ;		/* TypBlk or TypSCSI */

	Pcidev	*pci;

	uvlong	port;		/* physical base of the config bar */
	ulong	feat[2];	/* device feature bits 0-31 and 32-63 */

	int	nqueue;
	Vqueue	*queue[16];

	void	*dev;	/* device specific config (for scsi) */

	/* registers */
	Vconfig	*cfg;
	u8int	*isr;
	u8int	*notify;	/* base of the notification area */
	u32int	notifyoffmult;	/* multiplier for Vconfig.queuenotifyoff */

	Vdev	*next;
};
183
184 static Vqueue*
mkvqueue(int size)185 mkvqueue(int size)
186 {
187 Vqueue *q;
188 uchar *p;
189 int i;
190
191 q = malloc(sizeof(*q) + sizeof(void*)*size);
192 p = mallocalign(
193 PGROUND(sizeof(Vdesc)*size +
194 VringSize +
195 sizeof(u16int)*size +
196 sizeof(u16int)) +
197 PGROUND(VringSize +
198 sizeof(Vused)*size +
199 sizeof(u16int)),
200 BY2PG, 0, 0);
201 if(p == nil || q == nil){
202 print("virtio: no memory for Vqueue\n");
203 free(p);
204 free(q);
205 return nil;
206 }
207
208 q->desc = (void*)p;
209 p += sizeof(Vdesc)*size;
210 q->avail = (void*)p;
211 p += VringSize;
212 q->availent = (void*)p;
213 p += sizeof(u16int)*size;
214 q->availevent = (void*)p;
215 p += sizeof(u16int);
216
217 p = (uchar*)PGROUND((uintptr)p);
218 q->used = (void*)p;
219 p += VringSize;
220 q->usedent = (void*)p;
221 p += sizeof(Vused)*size;
222 q->usedevent = (void*)p;
223
224 q->free = -1;
225 q->nfree = q->size = size;
226 for(i=0; i<size; i++){
227 q->desc[i].next = q->free;
228 q->free = i;
229 }
230
231 return q;
232 }
233
234 static int
matchvirtiocfgcap(Pcidev * p,int cap,int off,int typ)235 matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
236 {
237 int bar;
238
239 if(cap != 9 || pcicfgr8(p, off+3) != typ)
240 return 1;
241
242 /* skip invalid or non memory bars */
243 bar = pcicfgr8(p, off+4);
244 if(bar < 0 || bar >= nelem(p->mem)
245 || p->mem[bar].size == 0
246 || (p->mem[bar].bar & 3) != 0)
247 return 1;
248
249 return 0;
250 }
251
/*
 * Find the pci capability for the given virtio cfg type (1 common,
 * 2 notify, 3 isr, 4 device specific); returns its config space
 * offset, or a negative value when not found.
 */
static int
virtiocap(Pcidev *p, int typ)
{
	return pcienumcaps(p, matchvirtiocfgcap, typ);
}
257
258 static void*
virtiomapregs(Pcidev * p,int cap,int size)259 virtiomapregs(Pcidev *p, int cap, int size)
260 {
261 int bar, len;
262 uvlong addr;
263
264 if(cap < 0)
265 return nil;
266 bar = pcicfgr8(p, cap+4) % nelem(p->mem);
267 addr = pcicfgr32(p, cap+8);
268 len = pcicfgr32(p, cap+12);
269 if(size <= 0)
270 size = len;
271 else if(len < size)
272 return nil;
273 if(addr+len > p->mem[bar].size)
274 return nil;
275 addr += p->mem[bar].bar & ~0xFULL;
276 return vmap(addr, size);
277 }
278
/*
 * Enumerate virtio 1.0 pci devices of the given type (pci device
 * id 0x1040+typ), map their register regions, reset each device,
 * negotiate features and set up its virtqueues. Returns a linked
 * list of Vdevs, nil when none are found.
 */
static Vdev*
viopnpdevs(int typ)
{
	Vdev *vd, *h, *t;
	Vconfig *cfg;
	Vqueue *q;
	Pcidev *p;
	int cap, bar;
	int n, i;

	h = t = nil;
	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
		/* non-transitional virtio 1.0 devices have revision id >= 1 */
		if(p->rid == 0)
			continue;
		/* cfg type 1: common configuration structure */
		if((cap = virtiocap(p, 1)) < 0)
			continue;
		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
		if(cfg == nil)
			continue;
		if((vd = malloc(sizeof(*vd))) == nil){
			print("virtio: no memory for Vdev\n");
			break;
		}
		vd->port = p->mem[bar].bar & ~0xFULL;
		vd->typ = typ;
		vd->pci = p;
		vd->cfg = cfg;

		/* cfg type 3: isr status byte */
		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
		if(vd->isr == nil){
		Baddev:
			/* TODO: vunmap */
			free(vd);
			continue;
		}
		/* cfg type 2: queue notification (doorbell) area */
		cap = virtiocap(p, 2);
		vd->notify = virtiomapregs(p, cap, 0);
		if(vd->notify == nil)
			goto Baddev;
		vd->notifyoffmult = pcicfgr32(p, cap+16);

		/* reset and wait for the device to acknowledge */
		cfg->status = 0;
		while(cfg->status != 0)
			delay(1);
		cfg->status = Acknowledge|Driver;

		/* negotiate feature bits: accept only VIRTIO_F_VERSION_1 (bit 32) */
		cfg->devfeatsel = 1;
		vd->feat[1] = cfg->devfeat;
		cfg->devfeatsel = 0;
		vd->feat[0] = cfg->devfeat;
		cfg->drvfeatsel = 1;
		cfg->drvfeat = vd->feat[1] & 1;
		cfg->drvfeatsel = 0;
		cfg->drvfeat = 0;
		cfg->status |= FeaturesOk;

		/* discover and initialise the virtqueues */
		for(i=0; i<nelem(vd->queue); i++){
			cfg->queuesel = i;
			n = cfg->queuesize;
			/* queue sizes must be non-zero powers of two */
			if(n == 0 || (n & (n-1)) != 0)
				break;
			if((q = mkvqueue(n)) == nil)
				break;
			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
			q->dev = vd;
			q->idx = i;
			vd->queue[i] = q;
			coherence();
			/* tell the device where the rings live */
			cfg->queuedesc = PADDR(q->desc);
			cfg->queueavail = PADDR(q->avail);
			cfg->queueused = PADDR(q->used);
		}
		vd->nqueue = i;

		/* append to the result list */
		if(h == nil)
			h = vd;
		else
			t->next = vd;
		t = vd;
	}

	return h;
}
365
/* wakeup handle linking a posted request to its sleeping process */
struct Rock {
	int	done;	/* completion flag, set by vqinterrupt() */
	Rendez	*sleep;
};
370
/*
 * Process completions on a queue: for each new entry in the used
 * ring wake up the waiting process (if any) and return the whole
 * descriptor chain to the free list. Called from interrupt context
 * and also polled from vqio() on timeout.
 */
static void
vqinterrupt(Vqueue *q)
{
	int id, free, m;
	struct Rock *r;
	Rendez *z;

	m = q->size-1;

	ilock(q);
	while((q->lastused ^ q->used->idx) & m){
		/* head descriptor index of the completed chain */
		id = q->usedent[q->lastused++ & m].id;
		if(r = q->rock[id]){
			q->rock[id] = nil;
			z = r->sleep;
			r->done = 1;	/* hands off */
			if(z != nil)
				wakeup(z);
		}
		/* free the chain, following the Next links */
		do {
			free = id;
			id = q->desc[free].next;
			q->desc[free].next = q->free;
			q->free = free;
			q->nfree++;
		} while(q->desc[free].flags & Next);
	}
	iunlock(q);
}
400
401 static void
viointerrupt(Ureg *,void * arg)402 viointerrupt(Ureg *, void *arg)
403 {
404 Vdev *vd = arg;
405
406 if(vd->isr[0] & 1)
407 vqinterrupt(vd->queue[vd->typ == TypSCSI ? 2 : 0]);
408 }
409
410 static int
viodone(void * arg)411 viodone(void *arg)
412 {
413 return ((struct Rock*)arg)->done;
414 }
415
/*
 * Post the descriptor chain starting at head on the avail ring,
 * notify the device and wait for completion. Called with q
 * ilocked; unlocks q. The one second tsleep timeout guards
 * against lost interrupts: on a wakeup without completion the
 * queue is polled via vqinterrupt().
 */
static void
vqio(Vqueue *q, int head)
{
	struct Rock rock;

	rock.done = 0;
	rock.sleep = &up->sleep;
	q->rock[head] = &rock;
	q->availent[q->avail->idx & (q->size-1)] = head;
	coherence();
	q->avail->idx++;
	iunlock(q);
	/* skip the doorbell when the device suppresses notifications */
	if((q->used->flags & 1) == 0)
		*((u16int*)q->notify) = q->idx;
	while(!rock.done){
		/* swallow notes so the wait cannot be interrupted away */
		while(waserror())
			;
		tsleep(rock.sleep, viodone, &rock, 1000);
		poperror();

		if(!rock.done)
			vqinterrupt(q);
	}
}
440
/*
 * Issue a single virtio-blk request and wait for completion.
 * typ selects the operation (0 read, 1 write, 4 flush as used by
 * the callers below); for flush a is nil and no data descriptor
 * is added. Returns the status byte written back by the device
 * (0 on success).
 */
static int
vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
{
	int need, free, head;
	Vqueue *q;
	Vdesc *d;

	u8int status;
	struct Vioblkreqhdr {
		u32int	typ;
		u32int	prio;
		u64int	lba;
	} req;

	/* header + status descriptor, plus one data descriptor if any */
	need = 2;
	if(a != nil)
		need = 3;

	status = -1;
	req.typ = typ;
	req.prio = 0;
	req.lba = lba;

	q = vd->queue[0];
	ilock(q);
	/* wait until enough descriptors are free */
	while(q->nfree < need){
		iunlock(q);

		if(!waserror())
			tsleep(&up->sleep, return0, 0, 500);
		poperror();

		ilock(q);
	}

	head = free = q->free;

	/* request header: device reads it */
	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(&req);
	d->len = sizeof(req);
	d->flags = Next;

	/* data buffer: device writes it on reads (typ == 0), reads it on writes */
	if(a != nil){
		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(a);
		d->len = secsize*count;
		d->flags = typ ? Next : (Write|Next);
	}

	/* status byte: device writes it */
	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(&status);
	d->len = sizeof(status);
	d->flags = Write;

	q->free = free;
	q->nfree -= need;

	/* queue io, unlock and wait for completion */
	vqio(q, head);

	return status;
}
503
/*
 * Issue a scsi command over the virtio-scsi request queue and wait
 * for completion. Fills in r->status, r->rlen and the sense data
 * from the device response. Returns r->status.
 */
static int
vioscsireq(SDreq *r)
{
	u8int resp[4+4+2+2+SENSESIZE];	/* response: sense_len, residual, status_qualifier, status+response, sense */
	u8int req[8+8+3+CDBSIZE];	/* request: lun, tag, attr/prio/crn, cdb */
	int free, head;
	u32int len;
	Vqueue *q;
	Vdesc *d;
	Vdev *vd;
	SDunit *u;
	Vscsidev *scsi;

	u = r->unit;
	vd = u->dev->ctlr;
	scsi = vd->dev;

	memset(resp, 0, sizeof(resp));
	memset(req, 0, sizeof(req));
	/* lun addressing: 1, target, lun (big endian) */
	req[0] = 1;
	req[1] = u->subno;
	req[2] = r->lun>>8;
	req[3] = r->lun&0xFF;
	*(u64int*)(&req[8]) = (uintptr)r;	/* request tag */

	memmove(&req[8+8+3], r->cmd, r->clen);

	/* queue 2 is the request queue (0: control, 1: event) */
	q = vd->queue[2];
	ilock(q);
	/* wait until enough descriptors are free */
	while(q->nfree < 3){
		iunlock(q);

		if(!waserror())
			tsleep(&up->sleep, return0, 0, 500);
		poperror();

		ilock(q);
	}

	head = free = q->free;

	/* command header: device reads it */
	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(req);
	d->len = 8+8+3+scsi->cdb_size;
	d->flags = Next;

	/* data-out buffer precedes the response */
	if(r->write && r->dlen > 0){
		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(r->data);
		d->len = r->dlen;
		d->flags = Next;
	}

	/* response: device writes it */
	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(resp);
	d->len = 4+4+2+2+scsi->sense_size;
	d->flags = Write;

	/* data-in buffer follows the response */
	if(!r->write && r->dlen > 0){
		d->flags |= Next;

		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(r->data);
		d->len = r->dlen;
		d->flags = Write;
	}

	q->free = free;
	q->nfree -= 2 + (r->dlen > 0);

	/* queue io, unlock and wait for completion */
	vqio(q, head);

	/* response+status */
	r->status = resp[10];
	if(resp[11] != 0)	/* non-zero response code */
		r->status = SDcheck;

	/* sense_len */
	len = *((u32int*)&resp[0]);
	if(len > 0){
		if(len > sizeof(r->sense))
			len = sizeof(r->sense);
		memmove(r->sense, &resp[4+4+2+2], len);
		r->flags |= SDvalidsense;
	}

	/* data residue */
	len = *((u32int*)&resp[4]);
	if(len > r->dlen)
		r->rlen = 0;
	else
		r->rlen = r->dlen - len;

	return r->status;
}
601
602 static long
viobio(SDunit * u,int lun,int write,void * a,long count,uvlong lba)603 viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
604 {
605 long ss, cc, max, ret;
606 Vdev *vd;
607
608 vd = u->dev->ctlr;
609 if(vd->typ == TypSCSI)
610 return scsibio(u, lun, write, a, count, lba);
611
612 max = 32;
613 ss = u->secsize;
614 ret = 0;
615 while(count > 0){
616 if((cc = count) > max)
617 cc = max;
618 if(vioblkreq(vd, write != 0, (uchar*)a + ret, cc, ss, lba) != 0)
619 error(Eio);
620 ret += cc*ss;
621 count -= cc;
622 lba += cc;
623 }
624 return ret;
625 }
626
/* transfer direction codes as returned by sdfakescsirw() */
enum {
	SDread,
	SDwrite,
};
631
632 static int
viorio(SDreq * r)633 viorio(SDreq *r)
634 {
635 int i, count, rw;
636 uvlong lba;
637 SDunit *u;
638 Vdev *vd;
639
640 u = r->unit;
641 vd = u->dev->ctlr;
642 if(vd->typ == TypSCSI)
643 return vioscsireq(r);
644 if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91){
645 if(vioblkreq(vd, 4, nil, 0, 0, 0) != 0)
646 return sdsetsense(r, SDcheck, 3, 0xc, 2);
647 return sdsetsense(r, SDok, 0, 0, 0);
648 }
649 if((i = sdfakescsi(r, nil, 0)) != SDnostatus)
650 return r->status = i;
651 if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
652 return i;
653 r->rlen = viobio(u, r->lun, rw == SDwrite, r->data, count, lba);
654 return r->status = SDok;
655 }
656
657 static int
vioonline(SDunit * u)658 vioonline(SDunit *u)
659 {
660 Vdev *vd;
661 Vblkdev *blk;
662 uvlong cap;
663
664 vd = u->dev->ctlr;
665 if(vd->typ == TypSCSI)
666 return scsionline(u);
667
668 blk = vd->dev;
669 cap = blk->capacity;
670 if(u->sectors != cap){
671 u->sectors = cap;
672 u->secsize = 512;
673 return 2;
674 }
675 return 1;
676 }
677
678 static int
vioverify(SDunit * u)679 vioverify(SDunit *u)
680 {
681 Vdev *vd;
682
683 vd = u->dev->ctlr;
684 if(vd->typ == TypSCSI)
685 return scsiverify(u);
686
687 return 1;
688 }
689
/* defined at the end of this file; referenced by viopnp() below */
SDifc sdvirtio10ifc;
691
692 static int
vioenable(SDev * sd)693 vioenable(SDev *sd)
694 {
695 char name[32];
696 Vdev *vd;
697 int i;
698
699 vd = sd->ctlr;
700 pcisetbme(vd->pci);
701 snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
702 intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
703 coherence();
704
705 for(i = 0; i < vd->nqueue; i++){
706 vd->cfg->queuesel = i;
707 vd->cfg->queueenable = 1;
708 }
709 vd->cfg->status |= DriverOk;
710
711 return 1;
712 }
713
714 static int
viodisable(SDev * sd)715 viodisable(SDev *sd)
716 {
717 char name[32];
718 Vdev *vd;
719
720 vd = sd->ctlr;
721 snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
722 intrdisable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
723 pciclrbme(vd->pci);
724 return 1;
725 }
726
/*
 * sd pnp entry point: probe for virtio block and scsi devices and
 * return a list of SDevs. Block devices are named from id 'F',
 * scsi devices from '0'.
 */
static SDev*
viopnp(void)
{
	SDev *s, *h, *t;
	Vdev *vd;
	int id;

	h = t = nil;

	id = 'F';
	for(vd = viopnpdevs(TypBlk); vd; vd = vd->next){
		if(vd->nqueue == 0)
			continue;

		/* cfg type 4: device specific config (the capacity) */
		if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
			break;
		if((s = malloc(sizeof(*s))) == nil)
			break;
		s->ctlr = vd;
		s->idno = id++;
		s->ifc = &sdvirtio10ifc;
		s->nunit = 1;
		if(h)
			t->next = s;
		else
			h = s;
		t = s;
	}

	id = '0';
	for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
		Vscsidev *scsi;

		/* need at least control, event and request queues */
		if(vd->nqueue < 3)
			continue;

		if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
			break;
		if(scsi->max_target == 0){
			vunmap(scsi, sizeof(Vscsidev));
			continue;
		}
		/* our fixed request/response buffers must fit the device sizes */
		if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){
			print("sdvirtio: cdb %ud or sense size %ud too big\n",
				scsi->cdb_size, scsi->sense_size);
			vunmap(scsi, sizeof(Vscsidev));
			continue;
		}
		vd->dev = scsi;

		if((s = malloc(sizeof(*s))) == nil)
			break;
		s->ctlr = vd;
		s->idno = id++;
		s->ifc = &sdvirtio10ifc;
		s->nunit = scsi->max_target;

		if(h)
			t->next = s;
		else
			h = s;
		t = s;
	}
	return h;
}
792
/* sd interface operations table for this driver */
SDifc sdvirtio10ifc = {
	"virtio10",			/* name */

	viopnp,				/* pnp */
	nil,				/* legacy */
	vioenable,			/* enable */
	viodisable,			/* disable */

	vioverify,			/* verify */
	vioonline,			/* online */
	viorio,				/* rio */
	nil,				/* rctl */
	nil,				/* wctl */

	viobio,				/* bio */
	nil,				/* probe */
	nil,				/* clear */
	nil,				/* rtopctl */
	nil,				/* wtopctl */
};
813