]> git.lizzy.rs Git - plan9front.git/commitdiff
virtio: add non-legacy virtio 1.0 drivers for disk and ethernet
authorcinap_lenrek <cinap_lenrek@felloff.net>
Sun, 11 Jul 2021 11:24:13 +0000 (11:24 +0000)
committercinap_lenrek <cinap_lenrek@felloff.net>
Sun, 11 Jul 2021 11:24:13 +0000 (11:24 +0000)
The new interface uses pci capability structures to locate the
registers in a rather fine-grained way, which makes it more complicated
as they can be located anywhere in any pci bar at any offset.

As far as i can see, qemu (6.0.50) never uses i/o bars in
non-legacy mode, so only mmio is implemented for now.

The previous virtio drivers implemented the legacy interface only
which uses i/o ports for all register accesses. This is still
the preferred method (and also qemu default) as it is easier to
emulate and most likely faster.

However, some vps providers like vultr force the legacy interface
to be disabled with the qemu -device option "disable-legacy=on", resulting
in a system without a disk and ethernet.

sys/src/9/pc/ethervirtio.c
sys/src/9/pc/ethervirtio10.c [new file with mode: 0644]
sys/src/9/pc/pc
sys/src/9/pc/sdvirtio.c
sys/src/9/pc/sdvirtio10.c [new file with mode: 0644]
sys/src/9/pc64/pc64

index 871f6d8845ebff4cec1bb4c28c4cd3eefa6b2354..db9dc2cbac7a0074235eda1e1da98b746fb3ea75 100644 (file)
@@ -1,3 +1,7 @@
+/*
+ * virtio ethernet driver implementing the legacy interface:
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ */
 #include "u.h"
 #include "../port/lib.h"
 #include "mem.h"
@@ -9,11 +13,6 @@
 #include "../port/netif.h"
 #include "../port/etherif.h"
 
-/*
- * virtio ethernet driver
- * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
- */
-
 typedef struct Vring Vring;
 typedef struct Vdesc Vdesc;
 typedef struct Vused Vused;
@@ -555,13 +554,14 @@ pciprobe(int typ)
        h = t = nil;
 
        /* §4.1.2 PCI Device Discovery */
-       for(p = nil; p = pcimatch(p, 0, 0);){
-               if(p->vid != 0x1AF4)
-                       continue;
+       for(p = nil; p = pcimatch(p, 0x1AF4, 0);){
                /* the two possible DIDs for virtio-net */
                if(p->did != 0x1000 && p->did != 0x1041)
                        continue;
-               /* non-transitional devices will have a revision > 0 */
+               /*
+                * non-transitional devices will have a revision > 0,
+                * these are handled by ethervirtio10 driver.
+                */
                if(p->rid != 0)
                        continue;
                /* first membar needs to be I/O */
@@ -588,6 +588,8 @@ pciprobe(int typ)
 
                /* §3.1.2 Legacy Device Initialization */
                outb(c->port+Qstatus, 0);
+               while(inb(c->port+Qstatus) != 0)
+                       delay(1);
                outb(c->port+Qstatus, Sacknowledge|Sdriver);
 
                /* negotiate feature bits */
diff --git a/sys/src/9/pc/ethervirtio10.c b/sys/src/9/pc/ethervirtio10.c
new file mode 100644 (file)
index 0000000..03108ca
--- /dev/null
@@ -0,0 +1,790 @@
+/*
+ * virtio 1.0 ethernet driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to ethervirtio.c, this driver handles the non-legacy
+ * interface for virtio ethernet which uses mmio for all register accesses
+ * and requires an elaborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ * 
+ * The reason this driver is needed is that vultr sets the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+#include "../port/etherif.h"
+
+typedef struct Vconfig Vconfig;
+typedef struct Vnetcfg Vnetcfg;
+
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vheader Vheader;
+typedef struct Vqueue Vqueue;
+
+typedef struct Ctlr Ctlr;
+
+enum {
+       /* §2.1 Device Status Field (bits of Vconfig.status) */
+       Sacknowledge = 1,
+       Sdriver = 2,
+       Sdriverok = 4,
+       Sfeatureok = 8,
+       Sfailed = 128,
+
+       /* flags in Qnetstatus (the legacy register name; presumably Vnetcfg.status in this driver) */
+       Nlinkup = (1<<0),
+       Nannounce = (1<<1),
+
+       /* feat[0] bits (read with devfeatsel = 0) */
+       Fmac = 1<<5,
+       Fstatus = 1<<16,
+       Fctrlvq = 1<<17,
+       Fctrlrx = 1<<18,
+
+       /* feat[1] bits (read with devfeatsel = 1) */
+       Fversion1 = 1<<(32-32), /* feature bit 32, stored as bit 0 of feat[1] */
+
+       /* vring used flags */
+       Unonotify = 1,
+       /* vring avail flags */
+       Rnointerrupt = 1,
+
+       /* descriptor flags */
+       Dnext = 1,
+       Dwrite = 2,
+       Dindirect = 4,
+
+       /* struct sizes in bytes, as laid out in ring memory */
+       VringSize = 4,
+       VdescSize = 16,
+       VusedSize = 8,
+       VheaderSize = 12,       /* NOTE(review): struct Vheader below declares only 10 bytes; the buffers are sized with this constant */
+
+       /* indices into Ctlr.queue[] */
+       Vrxq    = 0,
+       Vtxq    = 1,
+       Vctlq   = 2,
+
+       /* class/cmd for Vctlq; each class is followed by its commands */
+       CtrlRx  = 0x00,
+               CmdPromisc      = 0x00,
+               CmdAllmulti     = 0x01,
+       CtrlMac = 0x01,
+               CmdMacTableSet  = 0x00,
+       CtrlVlan= 0x02,
+               CmdVlanAdd      = 0x00,
+               CmdVlanDel      = 0x01,
+};
+
+struct Vconfig {       /* register block that pciprobe() maps from the type 1 virtio pci capability */
+       u32int  devfeatsel;     /* selects which 32-bit word devfeat shows */
+       u32int  devfeat;
+       u32int  drvfeatsel;     /* selects which 32-bit word drvfeat writes */
+       u32int  drvfeat;
+
+       u16int  msixcfg;
+       u16int  nqueues;
+
+       u8int   status; /* device status, S* bits; writing 0 resets the device */
+       u8int   cfggen;
+       u16int  queuesel;       /* selects the queue the queue* fields below refer to */
+
+       u16int  queuesize;
+       u16int  queuemsixvect;
+
+       u16int  queueenable;
+       u16int  queuenotifyoff; /* multiplied by Ctlr.notifyoffmult to find the notify register */
+
+       u64int  queuedesc;      /* physical address of the descriptor table */
+       u64int  queueavail;     /* physical address of the avail ring */
+       u64int  queueused;      /* physical address of the used ring */
+};
+
+struct Vnetcfg /* device specific config that pciprobe() maps from the type 4 capability */
+{
+       u16int  mac0;   /* reset() accesses the first Eaddrlen bytes of this struct bytewise as the MAC */
+       u16int  mac1;
+       u16int  mac2;
+       u16int  status;
+       u16int  maxqueuepairs;
+       u16int  mtu;
+};
+
+struct Vring   /* common header of the avail and the used ring (VringSize bytes) */
+{
+       u16int  flags;  /* Rnointerrupt in the avail ring, Unonotify in the used ring */
+       u16int  idx;    /* free running index of the next entry to be written */
+};
+
+struct Vdesc   /* one descriptor table entry (VdescSize bytes) */
+{
+       u64int  addr;   /* physical address of the buffer */
+       u32int  len;    /* buffer length in bytes */
+       u16int  flags;  /* Dnext, Dwrite, Dindirect */
+       u16int  next;   /* index of the following descriptor when Dnext is set */
+};
+
+struct Vused   /* one used ring entry (VusedSize bytes) */
+{
+       u32int  id;     /* index of the head descriptor of the completed chain */
+       u32int  len;    /* byte count reported by the device; rxproc() subtracts VheaderSize from it */
+};
+
+struct Vheader /* per packet header; only used as the type of the VheaderSize byte buffers in txproc()/rxproc() */
+{
+       u8int   flags;
+       u8int   segtype;
+       u16int  hlen;
+       u16int  seglen;
+       u16int  csumstart;
+       u16int  csumend;        /* NOTE(review): fields add up to 10 bytes while VheaderSize is 12 */
+};
+
+struct Vqueue  /* driver state of one virtqueue */
+{
+       Rendez; /* lets the queue itself be passed to sleep() and wakeup() */
+
+       uint    qsize;  /* number of descriptors, a power of two (checked in pciprobe) */
+       uint    qmask;  /* qsize - 1 */
+
+       Vdesc   *desc;
+
+       Vring   *avail;
+       u16int  *availent;      /* qsize entries following the avail header */
+       u16int  *availevent;    /* trailing u16int after the avail entries */
+
+       Vring   *used;
+       Vused   *usedent;       /* qsize entries following the used header */
+       u16int  *usedevent;     /* trailing u16int after the used entries */
+       u16int  lastused;       /* used->idx value up to which completions were consumed */
+
+       uint    nintr;  /* wakeups issued by interrupt(), reported by ifstat() */
+       uint    nnote;  /* notifications sent by vqnotify(), reported by ifstat() */
+
+       /* notify register */
+       void    *notify;
+};
+
+struct Ctlr {  /* per device state, linked into the ctlrhead list */
+       Lock;   /* taken with ilock() in attach() */
+
+       QLock   ctllock;        /* serializes commands on the control queue, see vctlcmd() */
+
+       int     attached;       /* set by the first attach() */
+
+       /* registers */
+       Vconfig *cfg;
+       Vnetcfg *dev;
+       u8int   *isr;
+       u8int   *notify;
+       u32int  notifyoffmult;
+
+       uvlong  port;   /* base of the memory bar holding cfg; matched against edev->port in reset() */
+       Pcidev  *pcidev;
+       Ctlr    *next;
+       int     active; /* set once reset() handed the controller to an Ether */
+       ulong   feat[2];        /* device feature words 0 and 1 as read in pciprobe() */
+       int     nqueue; /* number of initialized entries in queue[] */
+
+       /* virtioether has 3 queues: rx, tx and ctl */
+       Vqueue  queue[3];
+};
+
+static Ctlr *ctlrhead; /* controllers found by pciprobe(), filled in lazily by reset() */
+
+/*
+ * sleep/wakeup condition for a Vqueue: true when the used
+ * index differs from the last used index the driver consumed.
+ */
+static int
+vhasroom(void *v)
+{
+       Vqueue *vq;
+
+       vq = v;
+       return vq->used->idx != vq->lastused;
+}
+
+static void
+vqnotify(Ctlr *ctlr, int x)    /* notify the device about queue x unless it asked not to be notified */
+{
+       Vqueue *q;
+
+       coherence();    /* ring updates made by the caller come before the flag check and the notify write */
+       q = &ctlr->queue[x];
+       if(q->used->flags & Unonotify)
+               return;
+       q->nnote++;
+       *((u16int*)q->notify) = x;      /* the queue index is written to the queue's notify register */
+}
+
+static void
+txproc(void *v)        /* transmit kproc started by attach(): moves blocks from edev->oq into the Vtxq ring */
+{
+       Vheader *header;
+       Block **blocks;
+       Ether *edev;
+       Ctlr *ctlr;
+       Vqueue *q;
+       Vused *u;
+       Block *b;
+       int i, j;
+
+       edev = v;
+       ctlr = edev->ctlr;
+       q = &ctlr->queue[Vtxq];
+
+       header = smalloc(VheaderSize);  /* a single header buffer, referenced by every chain below */
+       blocks = smalloc(sizeof(Block*) * (q->qsize/2));        /* block in flight for each descriptor pair */
+
+       for(i = 0; i < q->qsize/2; i++){        /* descriptors form fixed pairs: even = header, odd = packet data */
+               j = i << 1;
+               q->desc[j].addr = PADDR(header);
+               q->desc[j].len = VheaderSize;
+               q->desc[j].next = j | 1;
+               q->desc[j].flags = Dnext;
+
+               q->availent[i] = q->availent[i + q->qsize/2] = j;       /* both halves of the avail ring list the same chain heads */
+
+               j |= 1;
+               q->desc[j].next = 0;
+               q->desc[j].flags = 0;   /* addr and len are filled in per packet */
+       }
+
+       q->avail->flags &= ~Rnointerrupt;       /* initqueue() had set the flag */
+
+       while(waserror())       /* presumably: an error raised below lands here and the loop is entered again */
+               ;
+
+       while((b = qbread(edev->oq, 1000000)) != nil){
+               for(;;){
+                       /* retire completed packets */
+                       while((i = q->lastused) != q->used->idx){
+                               u = &q->usedent[i & q->qmask];
+                               i = (u->id & q->qmask) >> 1;    /* chain head index -> pair index */
+                               if(blocks[i] == nil)
+                                       break;
+                               freeb(blocks[i]);
+                               blocks[i] = nil;
+                               q->lastused++;
+                       }
+
+                       /* have free slot? */
+                       i = q->avail->idx & (q->qmask >> 1);    /* pair index the next avail entry refers to */
+                       if(blocks[i] == nil)
+                               break;
+
+                       /* ring full, wait and retry */
+                       if(!vhasroom(q))
+                               sleep(q, vhasroom, q);
+               }
+
+               /* slot is free, fill in descriptor */
+               blocks[i] = b;
+               j = (i << 1) | 1;
+               q->desc[j].addr = PADDR(b->rp);
+               q->desc[j].len = BLEN(b);
+               coherence();    /* descriptor is written before the avail index moves */
+               q->avail->idx++;
+               vqnotify(ctlr, Vtxq);
+       }
+
+       pexit("ether out queue closed", 1);
+}
+
+static void
+rxproc(void *v)        /* receive kproc started by attach(): keeps the Vrxq ring stocked and hands packets to etheriq() */
+{
+       Vheader *header;
+       Block **blocks;
+       Ether *edev;
+       Ctlr *ctlr;
+       Vqueue *q;
+       Vused *u;
+       Block *b;
+       int i, j;
+
+       edev = v;
+       ctlr = edev->ctlr;
+       q = &ctlr->queue[Vrxq];
+
+       header = smalloc(VheaderSize);  /* a single header buffer, written to by the device for every chain */
+       blocks = smalloc(sizeof(Block*) * (q->qsize/2));        /* block posted for each descriptor pair */
+
+       for(i = 0; i < q->qsize/2; i++){        /* descriptors form fixed pairs: even = header, odd = packet data */
+               j = i << 1;
+               q->desc[j].addr = PADDR(header);
+               q->desc[j].len = VheaderSize;
+               q->desc[j].next = j | 1;
+               q->desc[j].flags = Dwrite|Dnext;
+
+               q->availent[i] = q->availent[i + q->qsize/2] = j;       /* both halves of the avail ring list the same chain heads */
+
+               j |= 1;
+               q->desc[j].next = 0;
+               q->desc[j].flags = Dwrite;      /* addr and len are filled in when a block is posted */
+       }
+
+       q->avail->flags &= ~Rnointerrupt;       /* initqueue() had set the flag */
+
+       while(waserror())       /* presumably: an error raised below lands here and the loop is entered again */
+               ;
+
+       for(;;){
+               /* replenish receive ring; stops at a busy slot or when no block can be allocated */
+               do {
+                       i = q->avail->idx & (q->qmask >> 1);    /* pair index the next avail entry refers to */
+                       if(blocks[i] != nil)
+                               break;
+                       if((b = iallocb(ETHERMAXTU)) == nil)
+                               break;
+                       blocks[i] = b;
+                       j = (i << 1) | 1;
+                       q->desc[j].addr = PADDR(b->rp);
+                       q->desc[j].len = BALLOC(b);
+                       coherence();    /* descriptor is written before the avail index moves */
+                       q->avail->idx++;
+               } while(q->avail->idx != q->used->idx);
+               vqnotify(ctlr, Vrxq);
+
+               /* wait for any packets to complete */
+               if(!vhasroom(q))
+                       sleep(q, vhasroom, q);
+
+               /* retire completed packets */
+               while((i = q->lastused) != q->used->idx) {
+                       u = &q->usedent[i & q->qmask];
+                       i = (u->id & q->qmask) >> 1;    /* chain head index -> pair index */
+                       if((b = blocks[i]) == nil)
+                               break;
+
+                       blocks[i] = nil;
+                       b->wp = b->rp + u->len - VheaderSize;   /* the reported length includes the header descriptor */
+                       etheriq(edev, b);
+                       q->lastused++;
+               }
+       }
+}
+
+static int
+vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)   /* run one control queue command; returns the ack byte, or -1 without a usable control queue */
+{
+       uchar hdr[2], ack[1];
+       Ctlr *ctlr;
+       Vqueue *q;
+       Vdesc *d;
+       int i;
+
+       ctlr = edev->ctlr;
+       q = &ctlr->queue[Vctlq];
+       if(q->qsize < 3)        /* three descriptors are needed: header, data, ack */
+               return -1;
+
+       qlock(&ctlr->ctllock);  /* one command at a time, descriptors 0-2 are reused */
+       while(waserror())       /* presumably: an error raised while sleeping lands here and the command is set up again */
+               ;
+
+       ack[0] = 0x55;  /* nonzero, so an ack the device never wrote is reported below */
+       hdr[0] = class;
+       hdr[1] = cmd;
+
+       d = &q->desc[0];
+       d->addr = PADDR(hdr);
+       d->len = sizeof(hdr);
+       d->next = 1;
+       d->flags = Dnext;
+       d++;
+       d->addr = PADDR(data);
+       d->len = ndata;
+       d->next = 2;
+       d->flags = Dnext;
+       d++;
+       d->addr = PADDR(ack);
+       d->len = sizeof(ack);
+       d->next = 0;
+       d->flags = Dwrite;      /* the only descriptor of the chain the device writes */
+
+       i = q->avail->idx & q->qmask;
+       q->availent[i] = 0;     /* the chain always starts at descriptor 0 */
+       coherence();
+
+       q->avail->flags &= ~Rnointerrupt;       /* interrupts are wanted only while a command is pending */
+       q->avail->idx++;
+       vqnotify(ctlr, Vctlq);
+       while(!vhasroom(q))
+               sleep(q, vhasroom, q);
+       q->lastused = q->used->idx;
+       q->avail->flags |= Rnointerrupt;
+
+       qunlock(&ctlr->ctllock);
+       poperror();
+
+       if(ack[0] != 0)
+               print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
+
+       return ack[0];
+}
+
+static void
+interrupt(Ureg*, void* arg)    /* interrupt handler installed by reset(); arg is the Ether */
+{
+       Ether *edev;
+       Ctlr *ctlr;
+       Vqueue *q;
+       int i;
+
+       edev = arg;
+       ctlr = edev->ctlr;
+       if(*ctlr->isr & 1){     /* bit 0 of the isr register: presumably a queue interrupt - confirm against the spec */
+               for(i = 0; i < ctlr->nqueue; i++){
+                       q = &ctlr->queue[i];
+                       if(vhasroom(q)){        /* wake only sleepers of queues with unconsumed used entries */
+                               q->nintr++;
+                               wakeup(q);
+                       }
+               }
+       }
+}
+
+/*
+ * attach: bring the device live and start the rx and tx kprocs.
+ * Only the first call does any work; later calls see
+ * ctlr->attached set and return.
+ *
+ * The queues are enabled before Sdriverok is set: virtio 1.0
+ * §3.1.1 places virtqueue setup (step 7) ahead of setting
+ * DRIVER_OK (step 8), after which the device is live.
+ */
+static void
+attach(Ether* edev)
+{
+       char name[KNAMELEN];
+       Ctlr* ctlr;
+       int i;
+
+       ctlr = edev->ctlr;
+       ilock(ctlr);
+       if(ctlr->attached){
+               iunlock(ctlr);
+               return;
+       }
+       ctlr->attached = 1;
+
+       /* enable the queues set up by pciprobe() */
+       for(i = 0; i < ctlr->nqueue; i++){
+               ctlr->cfg->queuesel = i;
+               ctlr->cfg->queueenable = 1;
+       }
+
+       /* driver is ready, the device may now use the queues */
+       ctlr->cfg->status |= Sdriverok;
+       iunlock(ctlr);
+
+       /* start kprocs */
+       snprint(name, sizeof name, "#l%drx", edev->ctlrno);
+       kproc(name, rxproc, edev);
+       snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
+       kproc(name, txproc, edev);
+}
+
+static long
+ifstat(Ether *edev, void *a, long n, ulong offset)     /* ifstat read hook: formats features, status and per queue counters, returns readstr()'s result */
+{
+       int i, l;
+       char *p;
+       Ctlr *ctlr;
+       Vqueue *q;
+
+       ctlr = edev->ctlr;
+
+       p = smalloc(READSTR);   /* scratch buffer, freed before returning */
+
+       l = snprint(p, READSTR, "devfeat %32.32lub %32.32lub\n", ctlr->feat[1], ctlr->feat[0]);
+       l += snprint(p+l, READSTR-l, "devstatus %8.8ub\n", ctlr->cfg->status);  /* read from the live status register */
+
+       for(i = 0; i < ctlr->nqueue; i++){
+               q = &ctlr->queue[i];
+               l += snprint(p+l, READSTR-l,
+                       "vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
+                       i, q, q->qsize, q->avail->idx, q->used->idx, q->lastused, q->nintr, q->nnote);
+       }
+
+       n = readstr(offset, a, n, p);
+       free(p);
+
+       return n;
+}
+
+static void
+shutdown(Ether* edev)  /* shutdown hook: resets the device and turns off pci bus mastering */
+{
+       Ctlr *ctlr = edev->ctlr;
+
+       coherence();
+       ctlr->cfg->status = 0;  /* writing 0 is the device reset, as in pciprobe() */
+       coherence();
+
+       pciclrbme(ctlr->pcidev);
+}
+
+/*
+ * promiscuous hook: sends CtrlRx/CmdPromisc with a single
+ * byte argument, 1 to turn promiscuous mode on, 0 for off.
+ * The result of vctlcmd() is not checked.
+ */
+static void
+promiscuous(void *arg, int on)
+{
+       uchar flag[1];
+       Ether *edev;
+
+       edev = arg;
+       if(on != 0)
+               flag[0] = 1;
+       else
+               flag[0] = 0;
+       vctlcmd(edev, CtrlRx, CmdPromisc, flag, sizeof(flag));
+}
+
+/*
+ * multicast hook: the address and the add/remove argument are
+ * ignored.  CtrlRx/CmdAllmulti is sent with 1 as long as
+ * edev->nmaddr is above zero, and with 0 otherwise.
+ * The result of vctlcmd() is not checked.
+ */
+static void
+multicast(void *arg, uchar*, int)
+{
+       uchar flag[1];
+       Ether *edev;
+
+       edev = arg;
+       if(edev->nmaddr > 0)
+               flag[0] = 1;
+       else
+               flag[0] = 0;
+       vctlcmd(edev, CtrlRx, CmdAllmulti, flag, sizeof(flag));
+}
+
+static int
+initqueue(Vqueue *q, int size) /* allocate the three ring areas for size descriptors; returns 0, or -1 with everything freed again */
+{
+       uchar *p;
+
+       q->desc = mallocalign(VdescSize*size, 16, 0, 0);        /* descriptor table, 16 byte aligned */
+       if(q->desc == nil)
+               return -1;
+       p = mallocalign(VringSize + 2*size + 2, 2, 0, 0);       /* avail: header, size u16int entries, availevent */
+       if(p == nil){
+FreeDesc:
+               free(q->desc);
+               q->desc = nil;
+               return -1;
+       }
+       q->avail = (void*)p;
+       p += VringSize;
+       q->availent = (void*)p;
+       p += sizeof(u16int)*size;
+       q->availevent = (void*)p;
+       p = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0);       /* used: header, size Vused entries, usedevent */
+       if(p == nil){
+               free(q->avail);
+               q->avail = nil;
+               goto FreeDesc;
+       }
+       q->used = (void*)p;
+       p += VringSize;
+       q->usedent = (void*)p;
+       p += VusedSize*size;
+       q->usedevent = (void*)p;
+
+       q->qsize = size;
+       q->qmask = q->qsize - 1;        /* usable as a mask because the caller passes a power of two */
+
+       q->lastused = q->avail->idx = q->used->idx = 0;
+
+       q->avail->flags |= Rnointerrupt;        /* cleared again by the code that wants interrupts for the queue */
+
+       return 0;
+}
+
+/*
+ * pcienumcaps() callback, see virtiocap(): returns 0 for a
+ * capability with id 9 whose cfg type byte (off+3) equals typ
+ * and whose bar index (off+4) names a valid, non-empty memory
+ * bar.  Everything else returns 1.
+ */
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
+{
+       int bar;
+
+       if(cap != 9)
+               return 1;
+       if(pcicfgr8(p, off+3) != typ)
+               return 1;
+
+       /* skip invalid or non memory bars */
+       bar = pcicfgr8(p, off+4);
+       if(bar < 0 || bar >= nelem(p->mem))
+               return 1;
+       if(p->mem[bar].size == 0)
+               return 1;
+       if((p->mem[bar].bar & 3) != 0)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * returns what pcienumcaps() reports for the first capability
+ * accepted by matchvirtiocfgcap() for cfg type typ.  Callers
+ * treat a negative result as "not found".
+ */
+static int
+virtiocap(Pcidev *p, int typ)
+{
+       int off;
+
+       off = pcienumcaps(p, matchvirtiocfgcap, typ);
+       return off;
+}
+
+/*
+ * map the register window described by the virtio capability
+ * at config space offset cap.  The capability supplies the bar
+ * index (cap+4), the offset within the bar (cap+8) and the
+ * window length (cap+12).
+ *
+ * A size above zero is the minimum length required and the
+ * number of bytes mapped; size <= 0 maps the whole window.
+ * Returns nil when cap is negative, the window is too short
+ * or it does not fit into the bar.
+ */
+static void*
+virtiomapregs(Pcidev *p, int cap, int size)
+{
+       int idx, caplen;
+       uvlong off;
+
+       if(cap < 0)
+               return nil;
+
+       idx = pcicfgr8(p, cap+4) % nelem(p->mem);
+       off = pcicfgr32(p, cap+8);
+       caplen = pcicfgr32(p, cap+12);
+
+       if(size > 0){
+               if(caplen < size)
+                       return nil;
+       } else
+               size = caplen;
+
+       if(off+caplen > p->mem[idx].size)
+               return nil;
+
+       /* the low four bits of a memory bar are flags, not address */
+       off += p->mem[idx].bar & ~0xFULL;
+       return vmap(off, size);
+}
+
+/*
+ * pciprobe: find the non-transitional virtio-net pci devices
+ * (vid 0x1AF4, did 0x1041, revision > 0), map their register
+ * windows, reset them, negotiate features and set up the
+ * virtqueues.  Returns the controllers linked through
+ * Ctlr.next, or nil when none was usable.
+ *
+ * The queues are only configured here; attach() enables them
+ * and sets Sdriverok.
+ */
+static Ctlr*
+pciprobe(void)
+{
+       Ctlr *c, *h, *t;
+       Pcidev *p;
+       Vconfig *cfg;
+       int bar, cap, n, i;
+
+       h = t = nil;
+
+       /* §4.1.2 PCI Device Discovery */
+       for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
+               /* non-transitional devices will have a revision > 0 */
+               if(p->rid == 0)
+                       continue;
+               /* capability type 1: common configuration */
+               if((cap = virtiocap(p, 1)) < 0)
+                       continue;
+               bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+               cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+               if(cfg == nil)
+                       continue;
+               if((c = mallocz(sizeof(Ctlr), 1)) == nil){
+                       print("ethervirtio: no memory for Ctlr\n");
+                       break;
+               }
+               c->cfg = cfg;
+               c->pcidev = p;
+               c->port = p->mem[bar].bar & ~0xFULL;
+
+               pcienable(p);
+               /* capability types 4, 3 and 2: device config, isr and notify */
+               c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));
+               if(c->dev == nil)
+                       goto Baddev;
+               c->isr = virtiomapregs(p, virtiocap(p, 3), 0);
+               if(c->isr == nil)
+                       goto Baddev;
+               cap = virtiocap(p, 2);
+               c->notify = virtiomapregs(p, cap, 0);
+               if(c->notify == nil)
+                       goto Baddev;
+               c->notifyoffmult = pcicfgr32(p, cap+16);
+
+               /* device reset */
+               coherence();
+               cfg->status = 0;
+               while(cfg->status != 0)
+                       delay(1);
+               cfg->status = Sacknowledge|Sdriver;
+
+               /* negotiate feature bits */
+               cfg->devfeatsel = 1;
+               c->feat[1] = cfg->devfeat;
+
+               cfg->devfeatsel = 0;
+               c->feat[0] = cfg->devfeat;
+
+               cfg->drvfeatsel = 1;
+               cfg->drvfeat = c->feat[1] & Fversion1;
+
+               cfg->drvfeatsel = 0;
+               cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);
+
+               /*
+                * §3.1.1 Device Initialization: announce that feature
+                * negotiation is complete, then re-read the status to
+                * check that the device accepted our subset.
+                */
+               cfg->status |= Sfeatureok;
+               if((cfg->status & Sfeatureok) == 0){
+                       print("ethervirtio: device rejected feature set\n");
+                       cfg->status |= Sfailed;
+                       goto Baddev;
+               }
+
+               for(i=0; i<nelem(c->queue); i++){
+                       cfg->queuesel = i;
+                       n = cfg->queuesize;
+                       /* the ring code relies on power of two sizes */
+                       if(n == 0 || (n & (n-1)) != 0){
+                               if(i < 2)
+                                       print("ethervirtio: queue %d has invalid size %d\n", i, n);
+                               break;
+                       }
+                       if(initqueue(&c->queue[i], n) < 0)
+                               break;
+                       c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
+                       coherence();
+                       cfg->queuedesc = PADDR(c->queue[i].desc);
+                       cfg->queueavail = PADDR(c->queue[i].avail);
+                       cfg->queueused = PADDR(c->queue[i].used);
+               }
+               /* rx and tx are mandatory, the control queue is optional */
+               if(i < 2){
+                       print("ethervirtio: no queues\n");
+Baddev:
+                       /*
+                        * release the rings initqueue() allocated; the
+                        * queues were never enabled and unused entries
+                        * are still nil from mallocz().
+                        */
+                       for(n = 0; n < nelem(c->queue); n++){
+                               free(c->queue[n].desc);
+                               free(c->queue[n].avail);
+                               free(c->queue[n].used);
+                       }
+                       pcidisable(p);
+                       /* TODO, vunmap */
+                       free(c);
+                       continue;
+               }
+               c->nqueue = i;
+
+               if(h == nil)
+                       h = c;
+               else
+                       t->next = c;
+               t = c;
+       }
+
+       return h;
+}
+
+
+static int
+reset(Ether* edev)     /* probe hook passed to addethercard(): claims a free controller for edev; returns 0, or -1 when none is left */
+{
+       static uchar zeros[Eaddrlen];
+       Ctlr *ctlr;
+       int i;
+
+       if(ctlrhead == nil)
+               ctlrhead = pciprobe();  /* repeated on every call until a controller list exists */
+
+       for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
+               if(ctlr->active)
+                       continue;
+               if(edev->port == 0 || edev->port == ctlr->port){        /* port 0 accepts the first inactive controller */
+                       ctlr->active = 1;
+                       break;
+               }
+       }
+
+       if(ctlr == nil)
+               return -1;
+
+       edev->ctlr = ctlr;
+       edev->port = ctlr->port;
+       edev->irq = ctlr->pcidev->intl;
+       edev->tbdf = ctlr->pcidev->tbdf;
+       edev->mbps = 1000;
+       edev->link = 1;
+
+       if((ctlr->feat[0] & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){      /* device offers a MAC and none was configured */
+               for(i = 0; i < Eaddrlen; i++)
+                       edev->ea[i] = ((uchar*)ctlr->dev)[i];
+       } else {        /* otherwise the configured address is written into the device config */
+               for(i = 0; i < Eaddrlen; i++)
+                       ((uchar*)ctlr->dev)[i] = edev->ea[i];
+       }
+
+       edev->arg = edev;
+
+       edev->attach = attach;
+       edev->shutdown = shutdown;
+       edev->ifstat = ifstat;
+
+       if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){   /* both hooks go through vctlcmd() on the control queue */
+               edev->multicast = multicast;
+               edev->promiscuous = promiscuous;
+       }
+
+       pcisetbme(ctlr->pcidev);
+       intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);
+
+       return 0;
+}
+
+void
+ethervirtio10link(void)        /* link routine: registers reset() as the probe function for ether type "virtio10" */
+{
+       addethercard("virtio10", reset);
+}
index 4d242b2edbe47ede03ce3d90dc37a7e319ee74c4..9d980f9039a5bd8106f614096fe0fdfe68f851d9 100644 (file)
@@ -80,6 +80,7 @@ link
        etherwpi        pci wifi
        etherrt2860     pci wifi
        ethervirtio     pci
+       ethervirtio10   pci
        ethermedium
        pcmciamodem
        netdevmedium
@@ -108,6 +109,7 @@ misc
        sdiahci         pci sdscsi led
        sdodin          pci sdscsi led
        sdvirtio        pci sdscsi
+       sdvirtio10      pci sdscsi
        sdmmc           pci pmmc
        sdnvme          pci
        sdloop
index 4b422766717ab2ca3b638b691c1401956870d6db..c102fc03020407ce14ab4032ca9bec3a05a9ab1a 100644 (file)
@@ -1,3 +1,7 @@
+/*
+ * virtio disk driver implementing the legacy interface:
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ */
 #include "u.h"
 #include "../port/lib.h"
 #include "mem.h"
diff --git a/sys/src/9/pc/sdvirtio10.c b/sys/src/9/pc/sdvirtio10.c
new file mode 100644 (file)
index 0000000..df25df8
--- /dev/null
@@ -0,0 +1,808 @@
+/*
+ * virtio 1.0 disk driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to sdvirtio.c, this driver handles the non-legacy
+ * interface for virtio disk which uses mmio for all register accesses
+ * and requires an elaborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ * 
+ * The reason this driver is needed is that vultr sets the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "ureg.h"
+#include "../port/error.h"
+
+#include "../port/sd.h"
+
+typedef struct Vscsidev Vscsidev;
+typedef struct Vblkdev Vblkdev;
+
+typedef struct Vconfig Vconfig;
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vqueue Vqueue;
+typedef struct Vdev Vdev;
+
+
+/* device types; viopnpdevs() matches pci device id 0x1040+type */
+enum {
+       TypBlk  = 2,
+       TypSCSI = 8,
+};
+
+/* status flags (bits of Vconfig.status) */
+enum {
+       Acknowledge = 1,
+       Driver = 2,
+       DriverOk = 4,
+       Failed = 0x80,
+};
+
+/* descriptor flags */
+enum {
+       Next = 1,
+       Write = 2,
+       Indirect = 4,
+};
+
+/* struct sizes */
+enum {
+       VringSize = 4,  /* flags and idx of struct Vring, two u16int */
+};
+
+enum {
+       CDBSIZE         = 32,
+       SENSESIZE       = 96,
+};
+
+       
+struct Vscsidev        /* presumably the device specific config of a TypSCSI device, see Vdev.dev - confirm against the code that maps it */
+{
+       u32int  num_queues;
+       u32int  seg_max;
+       u32int  max_sectors;
+       u32int  cmd_per_lun;
+       u32int  event_info_size;
+       u32int  sense_size;
+       u32int  cdb_size;
+       u16int  max_channel;
+       u16int  max_target;
+       u32int  max_lun;
+};
+
+struct Vblkdev /* presumably the device specific config of a TypBlk device - confirm against the code that maps it */
+{
+       u64int  capacity;       /* NOTE(review): unit is not visible here, presumably sectors */
+};
+
+struct Vconfig {       /* register block that viopnpdevs() maps from the type 1 virtio pci capability */
+       u32int  devfeatsel;     /* selects which 32-bit word devfeat shows */
+       u32int  devfeat;
+       u32int  drvfeatsel;     /* selects which 32-bit word drvfeat writes */
+       u32int  drvfeat;
+
+       u16int  msixcfg;
+       u16int  nqueues;
+
+       u8int   status; /* device status flags; writing 0 resets the device */
+       u8int   cfggen;
+       u16int  queuesel;       /* selects the queue the queue* fields below refer to */
+
+       u16int  queuesize;
+       u16int  queuemsixvect;
+
+       u16int  queueenable;
+       u16int  queuenotifyoff; /* multiplied by Vdev.notifyoffmult to find the notify register */
+
+       u64int  queuedesc;      /* physical address of the descriptor table */
+       u64int  queueavail;     /* physical address of the avail ring */
+       u64int  queueused;      /* physical address of the used ring */
+};
+
+struct Vring   /* common header of the avail and the used ring (VringSize bytes) */
+{
+       u16int  flags;
+       u16int  idx;    /* free running index of the next entry to be written */
+};
+
+struct Vdesc   /* one descriptor table entry */
+{
+       u64int  addr;   /* physical address of the buffer */
+       u32int  len;    /* buffer length in bytes */
+       u16int  flags;  /* Next, Write, Indirect */
+       u16int  next;   /* following descriptor of a chain; also links free descriptors, see mkvqueue() */
+};
+
+struct Vused   /* one used ring entry */
+{
+       u32int  id;     /* head descriptor of the completed chain; indexes Vqueue.rock[] in vqinterrupt() */
+       u32int  len;
+};
+
+struct Vqueue  /* driver state of one virtqueue, allocated by mkvqueue() */
+{
+       Lock;   /* taken with ilock() in vqinterrupt() */
+
+       Vdev    *dev;
+       void    *notify;        /* notify register of this queue */
+       int     idx;    /* queue number within the device */
+
+       int     size;   /* number of descriptors, a power of two (checked in viopnpdevs) */
+
+       int     free;   /* head of the free descriptor list, -1 when empty */
+       int     nfree;  /* number of descriptors on the free list */
+
+       Vdesc   *desc;
+
+       Vring   *avail;
+       u16int  *availent;
+       u16int  *availevent;
+
+       Vring   *used;
+       Vused   *usedent;
+       u16int  *usedevent;
+       u16int  lastused;       /* used->idx value up to which completions were consumed */
+
+       void    *rock[];        /* size entries indexed by head descriptor; vqinterrupt() reads them as struct Rock* */
+};
+
+struct Vdev    /* per device state, linked into the list returned by viopnpdevs() */
+{
+       int     typ;    /* TypBlk or TypSCSI */
+
+       Pcidev  *pci;
+
+       uvlong  port;   /* base of the memory bar holding cfg */
+       ulong   feat[2];        /* device feature words 0 and 1 as read in viopnpdevs() */
+
+       int     nqueue; /* number of valid entries in queue[] */
+       Vqueue  *queue[16];
+
+       void    *dev;   /* device specific config (for scsi) */
+
+       /* registers */
+       Vconfig *cfg;
+       u8int   *isr;
+       u8int   *notify;
+       u32int  notifyoffmult;
+
+       Vdev    *next;
+};
+
+static Vqueue*
+mkvqueue(int size)     /* allocate a Vqueue with size descriptors and its ring memory; returns nil when out of memory */
+{
+       Vqueue *q;
+       uchar *p;
+       int i;
+
+       q = malloc(sizeof(*q) + sizeof(void*)*size);    /* struct plus the rock[] array; NOTE(review): fields not set below rely on malloc() returning zeroed memory - confirm */
+       p = mallocalign(        /* one page aligned allocation holding two page rounded areas */
+               PGROUND(sizeof(Vdesc)*size + 
+                       VringSize + 
+                       sizeof(u16int)*size + 
+                       sizeof(u16int)) +       /* first area: descriptors, avail ring, availevent */
+               PGROUND(VringSize + 
+                       sizeof(Vused)*size + 
+                       sizeof(u16int)),        /* second area: used ring, usedevent */
+               BY2PG, 0, 0);
+       if(p == nil || q == nil){
+               print("virtio: no memory for Vqueue\n");
+               free(p);
+               free(q);
+               return nil;
+       }
+
+       q->desc = (void*)p;
+       p += sizeof(Vdesc)*size;
+       q->avail = (void*)p;
+       p += VringSize;
+       q->availent = (void*)p;
+       p += sizeof(u16int)*size;
+       q->availevent = (void*)p;
+       p += sizeof(u16int);
+
+       p = (uchar*)PGROUND((uintptr)p);        /* the used ring starts on the next page boundary */
+       q->used = (void*)p;
+       p += VringSize;
+       q->usedent = (void*)p;
+       p += sizeof(Vused)*size;
+       q->usedevent = (void*)p;
+
+       q->free = -1;
+       q->nfree = q->size = size;
+       for(i=0; i<size; i++){  /* push every descriptor onto the free list, leaving size-1 at its head */
+               q->desc[i].next = q->free;
+               q->free = i;
+       }
+
+       return q;
+}
+
+/*
+ * pcienumcaps() callback, see virtiocap(): returns 0 for a
+ * capability with id 9 whose cfg type byte (off+3) equals typ
+ * and whose bar index (off+4) names a valid, non-empty memory
+ * bar.  Everything else returns 1.
+ */
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
+{
+       int bar;
+
+       if(cap != 9)
+               return 1;
+       if(pcicfgr8(p, off+3) != typ)
+               return 1;
+
+       /* skip invalid or non memory bars */
+       bar = pcicfgr8(p, off+4);
+       if(bar < 0 || bar >= nelem(p->mem))
+               return 1;
+       if(p->mem[bar].size == 0)
+               return 1;
+       if((p->mem[bar].bar & 3) != 0)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * returns what pcienumcaps() reports for the first capability
+ * accepted by matchvirtiocfgcap() for cfg type typ.  Callers
+ * treat a negative result as "not found".
+ */
+static int
+virtiocap(Pcidev *p, int typ)
+{
+       int off;
+
+       off = pcienumcaps(p, matchvirtiocfgcap, typ);
+       return off;
+}
+
+/*
+ * map the register window described by the virtio capability
+ * at config space offset cap.  The capability supplies the bar
+ * index (cap+4), the offset within the bar (cap+8) and the
+ * window length (cap+12).
+ *
+ * A size above zero is the minimum length required and the
+ * number of bytes mapped; size <= 0 maps the whole window.
+ * Returns nil when cap is negative, the window is too short
+ * or it does not fit into the bar.
+ */
+static void*
+virtiomapregs(Pcidev *p, int cap, int size)
+{
+       int idx, caplen;
+       uvlong off;
+
+       if(cap < 0)
+               return nil;
+
+       idx = pcicfgr8(p, cap+4) % nelem(p->mem);
+       off = pcicfgr32(p, cap+8);
+       caplen = pcicfgr32(p, cap+12);
+
+       if(size > 0){
+               if(caplen < size)
+                       return nil;
+       } else
+               size = caplen;
+
+       if(off+caplen > p->mem[idx].size)
+               return nil;
+
+       /* the low four bits of a memory bar are flags, not address */
+       off += p->mem[idx].bar & ~0xFULL;
+       return vmap(off, size);
+}
+
+/* §2.1 Device Status Field: the bit not listed with the other status flags of this file */
+enum {
+       FeaturesOk = 8,
+};
+
+/*
+ * viopnpdevs: find the non-transitional virtio pci devices of
+ * type typ (vid 0x1AF4, did 0x1040+typ, revision > 0), map
+ * their register windows, reset them, negotiate features and
+ * allocate the virtqueues.  Returns the devices linked through
+ * Vdev.next, or nil when none was found.
+ *
+ * Only the version 1 feature bit is accepted; no device
+ * specific feature is requested.
+ */
+static Vdev*
+viopnpdevs(int typ)
+{
+       Vdev *vd, *h, *t;
+       Vconfig *cfg;
+       Vqueue *q;
+       Pcidev *p;
+       int cap, bar;
+       int n, i;
+
+       h = t = nil;
+       for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
+               /* revision 0 is not handled by this driver */
+               if(p->rid == 0)
+                       continue;
+               /* capability type 1: common configuration */
+               if((cap = virtiocap(p, 1)) < 0)
+                       continue;
+               bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+               cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+               if(cfg == nil)
+                       continue;
+               if((vd = malloc(sizeof(*vd))) == nil){
+                       print("virtio: no memory for Vdev\n");
+                       break;
+               }
+               vd->port = p->mem[bar].bar & ~0xFULL;
+               vd->typ = typ;
+               vd->pci = p;
+               vd->cfg = cfg;
+               pcienable(p);
+
+               /* capability types 3 and 2: isr and notify */
+               vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
+               if(vd->isr == nil){
+Baddev:
+                       pcidisable(p);
+                       /* TODO: vunmap */
+                       free(vd);
+                       continue;
+               }
+               cap = virtiocap(p, 2);
+               vd->notify = virtiomapregs(p, cap, 0);
+               if(vd->notify == nil)
+                       goto Baddev;
+               vd->notifyoffmult = pcicfgr32(p, cap+16);
+
+               /* reset */
+               cfg->status = 0;
+               while(cfg->status != 0)
+                       delay(1);
+               cfg->status = Acknowledge|Driver;
+
+               /* negotiate feature bits */
+               cfg->devfeatsel = 1;
+               vd->feat[1] = cfg->devfeat;
+               cfg->devfeatsel = 0;
+               vd->feat[0] = cfg->devfeat;
+               cfg->drvfeatsel = 1;
+               cfg->drvfeat = vd->feat[1] & 1;
+               cfg->drvfeatsel = 0;
+               cfg->drvfeat = 0;
+
+               /*
+                * virtio 1.0 §3.1.1: announce that feature negotiation
+                * is complete, then re-read the status to check that
+                * the device accepted the selection.
+                */
+               cfg->status |= FeaturesOk;
+               if((cfg->status & FeaturesOk) == 0){
+                       print("virtio: device rejected feature set\n");
+                       cfg->status |= Failed;
+                       goto Baddev;
+               }
+
+               for(i=0; i<nelem(vd->queue); i++){
+                       cfg->queuesel = i;
+                       n = cfg->queuesize;
+                       /* stop at the first absent or non power of two queue */
+                       if(n == 0 || (n & (n-1)) != 0)
+                               break;
+                       if((q = mkvqueue(n)) == nil)
+                               break;
+                       q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
+                       q->dev = vd;
+                       q->idx = i;
+                       vd->queue[i] = q;
+                       coherence();
+                       cfg->queuedesc = PADDR(q->desc);
+                       cfg->queueavail = PADDR(q->avail);
+                       cfg->queueused = PADDR(q->used);
+               }
+               vd->nqueue = i;
+
+               if(h == nil)
+                       h = vd;
+               else
+                       t->next = vd;
+               t = vd;
+       }
+
+       return h;
+}
+/*
+ * Completion record for one request in flight.  vqio() keeps one on
+ * its stack and registers it in q->rock[head]; vqinterrupt() sets
+ * done and wakes up the Rendez once the device has returned the
+ * descriptor chain.
+ */
+struct Rock {
+       int done;               /* set to 1 by vqinterrupt() on completion */
+       Rendez *sleep;          /* Rendez to wake up; vqinterrupt() tolerates nil */
+};
+/*
+ * Collect completed requests from the used ring of q.  For every
+ * used entry not seen yet, the waiter registered in q->rock[] (if
+ * any) is marked done and woken up, and the descriptor chain is put
+ * back on the free list.  Takes the queue lock itself; called from
+ * viointerrupt() and, as a polling fallback, from vqio().
+ */
+static void
+vqinterrupt(Vqueue *q)
+{
+       int id, free, m;
+       struct Rock *r;
+       Rendez *z;
+       /* ring index mask; the probe code only accepts power of two queue sizes */
+       m = q->size-1;
+       /* consume used entries until lastused catches up with the device's index */
+       ilock(q);
+       while((q->lastused ^ q->used->idx) & m){
+               id = q->usedent[q->lastused++ & m].id;  /* head of the completed chain */
+               if(r = q->rock[id]){
+                       q->rock[id] = nil;
+                       z = r->sleep;   /* copy first: r is on the waiter's stack in vqio() */
+                       r->done = 1;    /* hands off: r must not be touched after this */
+                       if(z != nil)
+                               wakeup(z);
+               }
+               do {    /* push each descriptor of the chain onto the free list */
+                       free = id;
+                       id = q->desc[free].next;
+                       q->desc[free].next = q->free;
+                       q->free = free;
+                       q->nfree++;
+               } while(q->desc[free].flags & Next);    /* the last descriptor has no Next flag */
+       }
+       iunlock(q);
+}
+
+/*
+ * Interrupt handler for a virtio disk controller; arg is the Vdev.
+ * When bit 0 of the ISR status byte is set, completed requests are
+ * collected from the request queue: queue 2 for SCSI controllers,
+ * queue 0 otherwise.
+ */
+static void
+viointerrupt(Ureg *, void *arg)
+{
+       Vdev *vd;
+       int qno;
+
+       vd = arg;
+       if((vd->isr[0] & 1) == 0)
+               return;
+       qno = 0;
+       if(vd->typ == TypSCSI)
+               qno = 2;
+       vqinterrupt(vd->queue[qno]);
+}
+
+/*
+ * Sleep condition used by vqio() with tsleep(): true once the
+ * request described by the Rock passed in arg has completed.
+ */
+static int
+viodone(void *arg)
+{
+       struct Rock *rock;
+
+       rock = arg;
+       return rock->done;
+}
+/*
+ * Hand the descriptor chain starting at head to the device and wait
+ * until it has been processed.  The callers in this file enter with
+ * q locked (ilock) and the chain already filled in; the lock is
+ * released here once the chain is published on the available ring.
+ */
+static void
+vqio(Vqueue *q, int head)
+{
+       struct Rock rock;
+       /* register the completion record that vqinterrupt() looks up by chain head */
+       rock.done = 0;
+       rock.sleep = &up->sleep;
+       q->rock[head] = &rock;
+       q->availent[q->avail->idx & (q->size-1)] = head;
+       coherence();    /* ring entry must be visible before the index moves */
+       q->avail->idx++;
+       iunlock(q);
+       if((q->used->flags & 1) == 0)   /* bit 0: presumably the device's "no notify needed" hint */
+               *((u16int*)q->notify) = q->idx; /* write the queue index to the notify register */
+       while(!rock.done){
+               while(waserror())       /* errors raised by tsleep() are ignored; just retry */
+                       ;
+               tsleep(rock.sleep, viodone, &rock, 1000);
+               poperror();
+               /* woken up or timed out without completion: poll the used ring ourselves */
+               if(!rock.done)
+                       vqinterrupt(q);
+       }
+}
+
+/*
+ * Issue one virtio block request on queue 0 of vd and wait for it
+ * to complete.
+ *
+ * typ is stored in the request header; the callers in this file
+ * pass 0 for read, 1 for write and 4 for flush.  a is the data
+ * buffer of count sectors of secsize bytes starting at sector lba,
+ * or nil for a request without data.
+ *
+ * Returns the status byte written back by the device, 0 meaning
+ * success.  The byte is preset to 0xFF so that a status the device
+ * never touched reads as failure.
+ */
+static int
+vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
+{
+       int need, free, head;
+       Vqueue *q;
+       Vdesc *d;
+
+       u8int status;
+       struct Vioblkreqhdr {
+               u32int  typ;
+               u32int  prio;
+               u64int  lba;
+       } req;
+
+       /* header + status, plus one descriptor for the data buffer */
+       need = 2;
+       if(a != nil)
+               need = 3;
+
+       status = -1;
+       req.typ = typ;
+       req.prio = 0;
+       req.lba = lba;
+
+       q = vd->queue[0];
+       ilock(q);
+       while(q->nfree < need){
+               iunlock(q);
+
+               /*
+                * error() already unwinds the label pushed by
+                * waserror(), so poperror() must only be reached
+                * when tsleep() returned normally; otherwise the
+                * error label stack is popped twice.
+                */
+               if(!waserror()){
+                       tsleep(&up->sleep, return0, 0, 500);
+                       poperror();
+               }
+
+               ilock(q);
+       }
+
+       /* take the descriptors from the head of the free list */
+       head = free = q->free;
+
+       d = &q->desc[free]; free = d->next;
+       d->addr = PADDR(&req);
+       d->len = sizeof(req);
+       d->flags = Next;
+
+       if(a != nil){
+               d = &q->desc[free]; free = d->next;
+               d->addr = PADDR(a);
+               d->len = secsize*count;
+               /* typ 0 is a read: the device writes into the buffer */
+               d->flags = typ ? Next : (Write|Next);
+       }
+
+       d = &q->desc[free]; free = d->next;
+       d->addr = PADDR(&status);
+       d->len = sizeof(status);
+       d->flags = Write;
+
+       q->free = free;
+       q->nfree -= need;
+
+       /* queue io, unlock and wait for completion */
+       vqio(q, head);
+
+       return status;
+}
+
+/*
+ * Execute the SCSI command in r on queue 2 of the virtio SCSI
+ * controller and wait for its completion.
+ *
+ * The request header carries the target (u->subno), the lun, r
+ * itself as tag and the command block; the response carries sense
+ * length, residual, status, response code and sense data.  Fills in
+ * r->status, r->sense, r->flags and r->rlen and returns r->status.
+ */
+static int
+vioscsireq(SDreq *r)
+{
+       u8int resp[4+4+2+2+SENSESIZE];
+       u8int req[8+8+3+CDBSIZE];
+       int free, head;
+       u32int len;
+       Vqueue *q;
+       Vdesc *d;
+       Vdev *vd;
+       SDunit *u;
+       Vscsidev *scsi;
+
+       u = r->unit;
+       vd = u->dev->ctlr;
+       scsi = vd->dev;
+
+       memset(resp, 0, sizeof(resp));
+       memset(req, 0, sizeof(req));
+       req[0] = 1;
+       req[1] = u->subno;
+       req[2] = r->lun>>8;
+       req[3] = r->lun&0xFF;
+       *(u64int*)(&req[8]) = (uintptr)r;
+
+       memmove(&req[8+8+3], r->cmd, r->clen);
+
+       q = vd->queue[2];
+       ilock(q);
+       while(q->nfree < 3){
+               iunlock(q);
+
+               /*
+                * error() already unwinds the label pushed by
+                * waserror(), so poperror() must only be reached
+                * when tsleep() returned normally; otherwise the
+                * error label stack is popped twice.
+                */
+               if(!waserror()){
+                       tsleep(&up->sleep, return0, 0, 500);
+                       poperror();
+               }
+
+               ilock(q);
+       }
+
+       /* take the descriptors from the head of the free list */
+       head = free = q->free;
+
+       d = &q->desc[free]; free = d->next;
+       d->addr = PADDR(req);
+       d->len = 8+8+3+scsi->cdb_size;
+       d->flags = Next;
+
+       /* data going to the device comes before the response */
+       if(r->write && r->dlen > 0){
+               d = &q->desc[free]; free = d->next;
+               d->addr = PADDR(r->data);
+               d->len = r->dlen;
+               d->flags = Next;
+       }
+
+       d = &q->desc[free]; free = d->next;
+       d->addr = PADDR(resp);
+       d->len = 4+4+2+2+scsi->sense_size;
+       d->flags = Write;
+
+       /* data coming from the device follows the response */
+       if(!r->write && r->dlen > 0){
+               d->flags |= Next;
+
+               d = &q->desc[free]; free = d->next;
+               d->addr = PADDR(r->data);
+               d->len = r->dlen;
+               d->flags = Write;
+       }
+
+       q->free = free;
+       q->nfree -= 2 + (r->dlen > 0);
+
+       /* queue io, unlock and wait for completion */
+       vqio(q, head);
+
+       /* response+status */
+       r->status = resp[10];
+       if(resp[11] != 0)
+               r->status = SDcheck;
+
+       /* sense_len */
+       len = *((u32int*)&resp[0]);
+       if(len > 0){
+               if(len > sizeof(r->sense))
+                       len = sizeof(r->sense);
+               memmove(r->sense, &resp[4+4+2+2], len);
+               r->flags |= SDvalidsense;
+       }
+
+       /* data residue */
+       len = *((u32int*)&resp[4]);
+       if(len > r->dlen)
+               r->rlen = 0;
+       else
+               r->rlen = r->dlen - len;
+
+       return r->status;
+}
+
+/*
+ * Block i/o entry point: transfer count sectors starting at lba
+ * between the unit and the buffer a.  SCSI controllers are handed
+ * to scsibio(); block devices are served with requests of at most
+ * 32 sectors each.  Raises Eio when a request fails, otherwise
+ * returns the number of bytes transferred.
+ */
+static long
+viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
+{
+       long secsize, limit, n, done;
+       Vdev *vd;
+
+       vd = u->dev->ctlr;
+       if(vd->typ == TypSCSI)
+               return scsibio(u, lun, write, a, count, lba);
+
+       limit = 32;
+       secsize = u->secsize;
+       for(done = 0; count > 0; count -= n){
+               n = count;
+               if(n > limit)
+                       n = limit;
+               if(vioblkreq(vd, write != 0, (uchar*)a + done, n, secsize, lba) != 0)
+                       error(Eio);
+               done += n*secsize;
+               lba += n;
+       }
+       return done;
+}
+
+/*
+ * Raw SCSI request entry point.  SCSI controllers execute the
+ * request as is.  For block devices the commands 0x35 and 0x91 are
+ * turned into a request of type 4 without data, everything else is
+ * emulated by sdfakescsi()/sdfakescsirw() and reads and writes end
+ * up in viobio().
+ */
+static int
+viorio(SDreq *r)
+{
+       int st, nsec, dir;
+       uvlong start;
+       SDunit *unit;
+       Vdev *vd;
+
+       unit = r->unit;
+       vd = unit->dev->ctlr;
+       if(vd->typ == TypSCSI)
+               return vioscsireq(r);
+
+       switch(r->cmd[0]){
+       case 0x35:
+       case 0x91:
+               if(vioblkreq(vd, 4, nil, 0, 0, 0) == 0)
+                       return sdsetsense(r, SDok, 0, 0, 0);
+               return sdsetsense(r, SDcheck, 3, 0xc, 2);
+       }
+
+       st = sdfakescsi(r);
+       if(st != SDnostatus)
+               return r->status = st;
+       st = sdfakescsirw(r, &start, &nsec, &dir);
+       if(st != SDnostatus)
+               return st;
+       r->rlen = viobio(unit, r->lun, dir == SDwrite, r->data, nsec, start);
+       return r->status = SDok;
+}
+
+/*
+ * Online check for a unit.  SCSI controllers defer to scsionline().
+ * For block devices the capacity is read from the device
+ * configuration: returns 2 after updating u->sectors (with a sector
+ * size of 512) when it differs, 1 when nothing changed.
+ */
+static int
+vioonline(SDunit *u)
+{
+       Vblkdev *conf;
+       uvlong nsect;
+       Vdev *vd;
+
+       vd = u->dev->ctlr;
+       if(vd->typ == TypSCSI)
+               return scsionline(u);
+
+       conf = vd->dev;
+       nsect = conf->capacity;
+       if(u->sectors == nsect)
+               return 1;
+       u->sectors = nsect;
+       u->secsize = 512;
+       return 2;
+}
+
+/*
+ * Verify that a unit is present: SCSI controllers ask scsiverify(),
+ * a block device always reports its unit as present.
+ */
+static int
+vioverify(SDunit *u)
+{
+       Vdev *ctlr;
+
+       ctlr = u->dev->ctlr;
+       if(ctlr->typ != TypSCSI)
+               return 1;
+       return scsiverify(u);
+}
+
+SDifc sdvirtio10ifc;   /* tentative declaration, initialized at the end of the file; viopnp() takes its address */
+
+/*
+ * Bring the controller online: turn on PCI bus mastering, install
+ * the interrupt handler, enable every virtqueue that was set up at
+ * probe time and finally tell the device that the driver is ready.
+ * Always returns 1.
+ */
+static int
+vioenable(SDev *sd)
+{
+       char name[32];
+       Vdev *vd;
+       int i;
+
+       vd = sd->ctlr;
+       pcisetbme(vd->pci);
+       snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+       intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+       coherence();
+
+       /*
+        * The queues have to be enabled before DriverOk is set:
+        * the virtio 1.0 initialization sequence makes DRIVER_OK
+        * the last step, and a device may start only the queues
+        * that are enabled at that point.
+        */
+       for(i = 0; i < vd->nqueue; i++){
+               vd->cfg->queuesel = i;
+               vd->cfg->queueenable = 1;
+       }
+       vd->cfg->status |= DriverOk;
+
+       return 1;
+}
+
+/*
+ * Take the controller offline: remove the interrupt handler that
+ * vioenable() installed under the same name and turn off PCI bus
+ * mastering.  Always returns 1.
+ */
+static int
+viodisable(SDev *sd)
+{
+       char label[32];
+       Pcidev *pci;
+       Vdev *vd;
+
+       vd = sd->ctlr;
+       pci = vd->pci;
+       snprint(label, sizeof(label), "%s (%s)", sd->name, sd->ifc->name);
+       intrdisable(pci->intl, viointerrupt, vd, pci->tbdf, label);
+       pciclrbme(pci);
+       return 1;
+}
+/*
+ * Probe for virtio 1.0 disk controllers and return them as a linked
+ * list of SDev, block devices first, then SCSI controllers.  Block
+ * devices get the ids 'F', 'G', ... and a single unit each; SCSI
+ * controllers get '0', '1', ... and max_target units.  Returns nil
+ * when nothing usable was found.
+ */
+static SDev*
+viopnp(void)
+{
+       SDev *s, *h, *t;        /* current entry, head and tail of the result list */
+       Vdev *vd;
+       int id;
+       /* start with an empty list */
+       h = t = nil;
+       /* virtio block devices */
+       id = 'F';
+       for(vd =  viopnpdevs(TypBlk); vd; vd = vd->next){
+               if(vd->nqueue == 0)     /* no usable request queue */
+                       continue;
+               /* map the registers of capability type 4, read as Vblkdev by vioonline() */
+               if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
+                       break;
+               if((s = malloc(sizeof(*s))) == nil)
+                       break;
+               s->ctlr = vd;
+               s->idno = id++;
+               s->ifc = &sdvirtio10ifc;
+               s->nunit = 1;
+               if(h)
+                       t->next = s;
+               else
+                       h = s;
+               t = s;
+       }
+       /* virtio SCSI controllers */
+       id = '0';
+       for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
+               Vscsidev *scsi;
+               /* vioscsireq() and viointerrupt() use queue 2 */
+               if(vd->nqueue < 3)
+                       continue;
+               /* map the registers of capability type 4 as Vscsidev */
+               if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
+                       break;
+               if(scsi->max_target == 0){      /* would give a controller without units */
+                       vunmap(scsi, sizeof(Vscsidev));
+                       continue;
+               }
+               if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){       /* must fit the buffers in vioscsireq() */
+                       print("sdvirtio: cdb %ud or sense size %ud too big\n",
+                               scsi->cdb_size, scsi->sense_size);
+                       vunmap(scsi, sizeof(Vscsidev));
+                       continue;
+               }
+               vd->dev = scsi;
+               /* NOTE(review): when this malloc fails, scsi stays mapped and vd->dev stays set */
+               if((s = malloc(sizeof(*s))) == nil)
+                       break;
+               s->ctlr = vd;
+               s->idno = id++;
+               s->ifc = &sdvirtio10ifc;
+               s->nunit = scsi->max_target;
+               /* append to the result list */
+               if(h)
+                       t->next = s;
+               else
+                       h = s;
+               t = s;
+       }
+       return h;
+}
+/*
+ * Interface table of the virtio 1.0 disk driver.  The initializer
+ * is positional; each entry is labelled with the SDifc member it
+ * fills in, and nil marks an operation this driver does not provide.
+ */
+SDifc sdvirtio10ifc = {
+       "virtio10",                     /* name */
+
+       viopnp,                         /* pnp */
+       nil,                            /* legacy */
+       vioenable,                      /* enable */
+       viodisable,                     /* disable */
+
+       vioverify,                      /* verify */
+       vioonline,                      /* online */
+       viorio,                         /* rio */
+       nil,                            /* rctl */
+       nil,                            /* wctl */
+
+       viobio,                         /* bio */
+       nil,                            /* probe */
+       nil,                            /* clear */
+       nil,                            /* rtopctl */
+       nil,                            /* wtopctl */
+};
index 3250e9a45886fd9df838b9269e0ced942241bf1d..1c9fbd172b50c50337f40f3fef3892a263c9f349 100644 (file)
@@ -78,6 +78,7 @@ link
        etherwpi        pci wifi
        etherrt2860     pci wifi
        ethervirtio     pci
+       ethervirtio10   pci
        ethermedium
 #      pcmciamodem
        netdevmedium
@@ -105,6 +106,7 @@ misc
        sdiahci         pci sdscsi led
 #      sdodin          pci sdscsi led
        sdvirtio        pci sdscsi
+       sdvirtio10      pci sdscsi
        sdmmc           pci pmmc
        sdnvme          pci
        sdloop