From bfae9e08be692b944ab3018d98693a15ca38a64c Mon Sep 17 00:00:00 2001 From: cinap_lenrek Date: Wed, 29 Mar 2017 00:21:35 +0200 Subject: [PATCH] sdnvme: NVMe controller driver (work in progress) basic NVMe controller driver, reads and writes work. "namespaces" show up as logical units. uses pin/msi interrupts (no msi-x support yet). one submission queue per cpu, shared completion queue. no recovery from fatal controller errors. only tested in qemu (no hardware available). commiting this so it can be found by someone who has hardware. --- sys/src/9/pc/sdnvme.c | 663 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 663 insertions(+) create mode 100644 sys/src/9/pc/sdnvme.c diff --git a/sys/src/9/pc/sdnvme.c b/sys/src/9/pc/sdnvme.c new file mode 100644 index 000000000..c8605c0d2 --- /dev/null +++ b/sys/src/9/pc/sdnvme.c @@ -0,0 +1,663 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "io.h" +#include "ureg.h" +#include "../port/error.h" + +#include "../port/sd.h" + +typedef struct WS WS; +typedef struct CQ CQ; +typedef struct SQ SQ; +typedef struct Ctlr Ctlr; + +struct WS +{ + u32int cdw0; + ushort status; + Rendez *sleep; + WS **link; + SQ *queue; +}; + +struct CQ +{ + u32int head; + u32int mask; + u32int shift; + u32int *base; + Ctlr *ctlr; +}; + +struct SQ +{ + u32int tail; + u32int mask; + u32int shift; + u32int *base; + WS **wait; + Ctlr *ctlr; +}; + +struct Ctlr +{ + QLock; + + Lock intr; + u32int ints; + u32int irqc[2]; + + Pcidev *pci; + u32int *reg; + + u64int cap; + uchar *ident; + u32int *nsid; + int nnsid; + + u32int mps; /* mps = 1<sq[1+m->machno]; + } else { + qlock(ctlr); + sq = &ctlr->sq[0]; + } + ws->sleep = &up->sleep; + ws->queue = sq; + ws->link = &sq->wait[sq->tail & sq->mask]; + while(*ws->link != nil){ + sched(); + if(!adm){ + /* should be very rare */ + goto Retry; + } + } + *ws->link = ws; + + e = &sq->base[((cid = sq->tail++) & sq->mask)<<4]; + e[0] = opc | cid<<16; + e[1] = nsid; + e[2] = 0; + e[3] = 0; + if(mptr != nil){ + pa = PADDR(mptr); + e[4] = pa; + e[5] = pa>>32; + } else { + e[4] = 0; + e[5] = 0; + } + if(len > 0){ + pa = PADDR(data); + e[6] = pa; + e[7] = pa>>32; + if(len > ctlr->mps - (pa & ctlr->mps-1)) + pa += ctlr->mps - (pa & ctlr->mps-1); + else + pa = 0; + } else { + e[6] = 0; + e[7] = 0; + pa = 0; + } + e[8] = pa; + e[9] = pa>>32; + return e; +} + +static void +nvmeintr(Ureg *, void *arg) +{ + u32int phaseshift, *e; + WS *ws, **wp; + Ctlr *ctlr; + SQ *sq; + CQ *cq; + + ctlr = arg; + if(ctlr->ints == 0) + return; + + ilock(&ctlr->intr); + ctlr->reg[IntMs] = ctlr->ints; + for(cq = &ctlr->cq[nelem(ctlr->cq)-1]; cq >= ctlr->cq; cq--){ + if(cq->base == nil) + continue; + phaseshift = 16 - cq->shift; + for(;; cq->head++){ + e = &cq->base[(cq->head & cq->mask)<<2]; + if(((e[3] ^ (cq->head << phaseshift)) & 0x10000) == 0) + break; + + if(0) iprint("nvmeintr: cq%d [%.4ux] %.8ux %.8ux %.8ux %.8ux\n", + (int)(cq - ctlr->cq), cq->head & cq->mask, + e[0], e[1], e[2], e[3]); + + sq = &ctlr->sq[e[2] >> 16]; + wp = &sq->wait[e[3] & sq->mask]; + if((ws = *wp) != nil && ws->link == wp){ + Rendez *z = ws->sleep; + ws->cdw0 = e[0]; + ws->status = e[3]>>17; + *wp = nil; + wakeup(z); + } + } + ctlr->reg[DBell + ((cq-ctlr->cq)*2+1 << ctlr->dstrd)] = cq->head & cq->mask; + } + if((ctlr->reg[CSts] & 3) != 1) + iprint("nvmeintr: fatal controller error\n"); + ctlr->reg[IntMc] = ctlr->ints; + iunlock(&ctlr->intr); +} + +static int +wdone(void *arg) +{ + WS *ws = arg; + return *ws->link != ws; +} + +static u32int +wcmd(WS *ws) +{ + SQ *sq = ws->queue; + Ctlr *ctlr = sq->ctlr; + + coherence(); + ctlr->reg[DBell + ((sq-ctlr->sq)*2+0 << ctlr->dstrd)] = sq->tail & sq->mask; + if(sq > ctlr->sq) { + assert(sq == &ctlr->sq[1+m->machno]); + spllo(); + } else + qunlock(sq->ctlr); + while(waserror()) + ; + tsleep(ws->sleep, wdone, ws, 5); + while(!wdone(ws)){ + nvmeintr(nil, ctlr); + tsleep(ws->sleep, wdone, ws, 10); + } + poperror(); + return ws->status; +} + +void +checkstatus(u32int status, char *info) +{ + if(status == 0) + return; + snprint(up->genbuf, sizeof(up->genbuf), "%s: status %ux", info, status); + error(up->genbuf); +} + +static long +nvmebio(SDunit *u, int lun, int write, void *a, long count, uvlong lba) +{ + u32int nsid, s, n, m, *e; + Ctlr *ctlr; + uchar *p; + WS ws; + + USED(lun); + + ctlr = u->dev->ctlr; + nsid = ctlr->nsid[u->subno]; + s = u->secsize; + p = a; + while(count > 0){ + m = (2*ctlr->mps - ((uintptr)p & ctlr->mps-1)) / s; + if((n = count) > m) + n = m; + e = qcmd(&ws, ctlr, 0, write ? 0x01 : 0x02, nsid, nil, p, n*s); + e[10] = lba; + e[11] = lba>>32; + e[12] = n-1; + e[13] = (count>n)<<6; /* sequential request */ + e[14] = 0; + e[15] = 0; + checkstatus(wcmd(&ws), write ? "write" : "read"); + p += n*s; + count -= n; + lba += n; + } + return p - (uchar*)a; +} + +static int +nvmerio(SDreq *r) +{ + int i, count, rw; + uvlong lba; + SDunit *u; + + u = r->unit; + if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91) + return sdsetsense(r, SDok, 0, 0, 0); + if((i = sdfakescsi(r)) != SDnostatus) + return r->status = i; + if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus) + return i; + r->rlen = nvmebio(u, r->lun, rw == SDwrite, r->data, count, lba); + return r->status = SDok; +} + +static int +nvmeverify(SDunit *u) +{ + Ctlr *ctlr = u->dev->ctlr; + return u->subno < ctlr->nnsid; +} + +static int +nvmeonline(SDunit *u) +{ + u32int *e, lbaf; + uchar *info, *p; + Ctlr *ctlr; + WS ws; + + if(u->sectors != 0) + return 1; + + ctlr = u->dev->ctlr; + if((info = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil) + return 0; + + e = qcmd(&ws, ctlr, 1, 0x06, ctlr->nsid[u->subno], nil, info, 0x1000); + e[10] = 0; // identify namespace + if(wcmd(&ws) != 0){ + free(info); + return 0; + } + p = info; + u->sectors = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24 + | (u64int)p[4]<<32 + | (u64int)p[5]<<40 + | (u64int)p[6]<<48 + | (u64int)p[7]<<56; + p = &info[128 + 4*(info[26]&15)]; + lbaf = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24; + u->secsize = 1<<((lbaf>>16)&0xFF); + free(info); + + memset(u->inquiry, 0, sizeof u->inquiry); + u->inquiry[2] = 2; + u->inquiry[3] = 2; + u->inquiry[4] = sizeof u->inquiry - 4; + memmove(u->inquiry+8, ctlr->ident+24, 20); + + return 2; +} + +static int +nvmerctl(SDunit *u, char *p, int l) +{ + Ctlr *ctlr; + char *e, *s; + + if((ctlr = u->dev->ctlr) == nil || ctlr->ident == nil) + return 0; + + e = p+l; + s = p; + + p = seprint(p, e, "model\t%.20s\n", (char*)ctlr->ident+24); + p = seprint(p, e, "serial\t%.10s\n", (char*)ctlr->ident+4); + p = seprint(p, e, "firm\t%.6s\n", (char*)ctlr->ident+64); + p = seprint(p, e, "geometry %llud %lud\n", u->sectors, u->secsize); + + return p-s; +} + +static void* +cqalloc(Ctlr *ctlr, CQ *cq, u32int lgsize) +{ + cq->ctlr = ctlr; + cq->head = 0; + cq->shift = lgsize-4; + cq->mask = (1<shift)-1; + if((cq->base = mallocalign(1<mps, 0, 0)) == nil) + error(Enomem); + memset(cq->base, 0, 1<base; +} + +static void* +sqalloc(Ctlr *ctlr, SQ *sq, u32int lgsize) +{ + sq->ctlr = ctlr; + sq->tail = 0; + sq->shift = lgsize-6; + sq->mask = (1<shift)-1; + if((sq->base = mallocalign(1<mps, 0, 0)) == nil) + error(Enomem); + if((sq->wait = mallocz(sizeof(WS*)*(sq->mask+1), 1)) == nil) + error(Enomem); + memset(sq->base, 0, 1<base; +} + +static void +setupqueues(Ctlr *ctlr) +{ + u32int lgsize, *e; + CQ *cq; + SQ *sq; + WS ws; + int i; + + /* Overkill */ + lgsize = 12-6+4; + while(lgsize < 16+4 && lgsize < ctlr->mpsshift && 1<cq[1]; + cqalloc(ctlr, cq, lgsize); + e = qcmd(&ws, ctlr, 1, 0x05, ~0, nil, cq->base, 1<cq) | cq->mask<<16; + e[11] = 3; /* IEN | PC */ + checkstatus(wcmd(&ws), "create completion queue"); + + /* SQID[1..nmach]: submission queue per cpu */ + for(i=1; i<=conf.nmach; i++){ + sq = &ctlr->sq[i]; + sqalloc(ctlr, sq, 12); + e = qcmd(&ws, ctlr, 1, 0x01, ~0, nil, sq->base, 0x1000); + e[10] = i | sq->mask<<16; + e[11] = (cq - ctlr->cq)<<16 | 1; /* CQID<<16 | PC */ + checkstatus(wcmd(&ws), "create submission queue"); + } + + ilock(&ctlr->intr); + ctlr->ints |= 1<<(cq - ctlr->cq); + ctlr->reg[IntMc] = ctlr->ints; + iunlock(&ctlr->intr); +} + +static void +identify(Ctlr *ctlr) +{ + u32int *e; + WS ws; + + if(ctlr->ident == nil) + if((ctlr->ident = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil) + error(Enomem); + if(ctlr->nsid == nil) + if((ctlr->nsid = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil) + error(Enomem); + + e = qcmd(&ws, ctlr, 1, 0x06, ~0, nil, ctlr->ident, 0x1000); + e[10] = 1; // identify controller + checkstatus(wcmd(&ws), "identify controller"); + + e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->nsid, 0x1000); + e[10] = 2; // namespace list + checkstatus(wcmd(&ws), "namespace list"); + + ctlr->nnsid = 0; + while(ctlr->nnsid < 1024 && ctlr->nsid[ctlr->nnsid] != 0) + ctlr->nnsid++; +} + +static int +nvmedisable(SDev *sd) +{ + char name[32]; + Ctlr *ctlr; + int i; + + ctlr = sd->ctlr; + + /* mask interrupts */ + ilock(&ctlr->intr); + ctlr->ints = 0; + ctlr->reg[IntMs] = ~ctlr->ints; + iunlock(&ctlr->intr); + + /* disable controller */ + ctlr->reg[CCfg] = 0; + + for(i = 0; i < 10; i++){ + if((ctlr->reg[CSts] & 1) == 0) + break; + tsleep(&up->sleep, return0, nil, 100); + } + + snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name); + intrdisable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name); + + pciclrbme(ctlr->pci); /* dma disable */ + + for(i=0; isq); i++){ + free(ctlr->sq[i].base); + free(ctlr->sq[i].wait); + } + for(i=0; icq); i++) + free(ctlr->cq[i].base); + + memset(ctlr->sq, 0, sizeof(ctlr->sq)); + memset(ctlr->cq, 0, sizeof(ctlr->cq)); + + free(ctlr->ident); + ctlr->ident = nil; + free(ctlr->nsid); + ctlr->nsid = nil; + ctlr->nnsid = 0; + + return 1; +} + +static int +nvmeenable(SDev *sd) +{ + char name[32]; + Ctlr *ctlr; + u64int pa; + int to; + + ctlr = sd->ctlr; + + snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name); + intrenable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name); + + if(waserror()){ + print("%s: %s\n", name, up->errstr); + nvmedisable(sd); + sd->nunit = 0; /* hack: prevent further probing */ + return 0; + } + + pa = PADDR(cqalloc(ctlr, &ctlr->cq[0], ctlr->mpsshift)); + ctlr->reg[ACQBase0] = pa; + ctlr->reg[ACQBase1] = pa>>32; + + pa = PADDR(sqalloc(ctlr, &ctlr->sq[0], ctlr->mpsshift)); + ctlr->reg[ASQBase0] = pa; + ctlr->reg[ASQBase1] = pa>>32; + + ctlr->reg[AQAttr] = ctlr->sq[0].mask | ctlr->cq[0].mask<<16; + + /* dma enable */ + pcisetbme(ctlr->pci); + + /* enable interrupt */ + ilock(&ctlr->intr); + ctlr->ints = 1; + ctlr->reg[IntMc] = ctlr->ints; + iunlock(&ctlr->intr); + + /* enable controller */ + ctlr->reg[CCfg] = 1 | (ctlr->mpsshift-12)<<7 | 6<<16 | 4<<20; + + for(to = (ctlr->cap>>24) & 255; to >= 0; to--){ + tsleep(&up->sleep, return0, nil, 500); + if((ctlr->reg[CSts] & 3) == 1) + goto Ready; + } + if(ctlr->reg[CSts] & 2) + error("fatal controller status during initialization"); + error("controller initialization timeout"); +Ready: + identify(ctlr); + setupqueues(ctlr); + + poperror(); + + return 1; +} + +static Ctlr* +nvmepnpctlrs(void) +{ + Ctlr *ctlr, *h, *t; + Pcidev *p; + int i; + + h = t = nil; + for(p = nil; p = pcimatch(p, 0, 0);){ + if(p->ccrb != 1 || p->ccru != 8 || p->ccrp != 2) + continue; + if(p->mem[0].size == 0) + continue; + if((ctlr = malloc(sizeof(*ctlr))) == nil){ + print("nvme: no memory for Ctlr\n"); + break; + } + ctlr->pci = p; + ctlr->reg = vmap(p->mem[0].bar & ~0xF, p->mem[0].size); + if(ctlr->reg == nil){ + print("nvme: can't vmap bar0\n"); + Bad: + if(ctlr->reg != nil) + vunmap(ctlr->reg, p->mem[0].size); + free(ctlr); + continue; + } + ctlr->cap = ctlr->reg[Cap0]; + ctlr->cap |= (u64int)ctlr->reg[Cap1]<<32; + + /* mask interrupts */ + ctlr->ints = 0; + ctlr->reg[IntMs] = ~ctlr->ints; + + /* disable controller */ + ctlr->reg[CCfg] = 0; + + if((ctlr->cap&(1ULL<<37)) == 0){ + print("nvme: doesnt support NVM commactlr set: %ux\n", + (u32int)(ctlr->cap>>37) & 0xFF); + goto Bad; + } + + /* use 64K page size when possible */ + ctlr->dstrd = (ctlr->cap >> 32) & 15; + for(i = (ctlr->cap >> 48) & 15; i < ((ctlr->cap >> 52) & 15); i++){ + if(i >= 16-12) /* 64K */ + break; + } + ctlr->mpsshift = i+12; + ctlr->mps = 1 << ctlr->mpsshift; + + if(h == nil) + h = ctlr; + else + t->next = ctlr; + t = ctlr; + } + + return h; +} + +SDifc sdnvmeifc; + +static SDev* +nvmepnp(void) +{ + SDev *s, *h, *t; + Ctlr *ctlr; + int id; + + h = t = nil; + + id = 'N'; + for(ctlr = nvmepnpctlrs(); ctlr != nil; ctlr = ctlr->next){ + if((s = malloc(sizeof(*s))) == nil) + break; + s->ctlr = ctlr; + s->idno = id++; + s->ifc = &sdnvmeifc; + s->nunit = 1024; + if(h) + t->next = s; + else + h = s; + t = s; + } + + return h; +} + +SDifc sdnvmeifc = { + "nvme", /* name */ + + nvmepnp, /* pnp */ + nil, /* legacy */ + nvmeenable, /* enable */ + nvmedisable, /* disable */ + + nvmeverify, /* verify */ + nvmeonline, /* online */ + nvmerio, /* rio */ + nvmerctl, /* rctl */ + nil, /* wctl */ + + nvmebio, /* bio */ + nil, /* probe */ + nil, /* clear */ + nil, /* rtopctl */ + nil, /* wtopctl */ +}; -- 2.44.0