3 * Follows device config in Ken's file server.
4 * Builds mirrors, concatenations, interleavings, and partitions
5 * of devices out of other (inner) devices.
6 * It is ok if inner devices are provided by this driver.
8 * Built files are grouped into different directories
9 * (called trees, and used to represent disks).
10 * The "#k/fs" tree is always available and never goes away.
11 * Configuration changes happen only while no I/O is in progress.
13 * Default sector size is one byte unless changed by the "disk" ctl.
17 #include "../port/lib.h"
23 #include "../port/error.h"
26 int dec16(uchar *out, int lim, char *in, int n);
31 Fmirror, /* mirror of others */
32 Fcat, /* catenation of others */
33 Finter, /* interleaving of others */
34 Fpart, /* part of other */
35 Fclear, /* start over */
36 Fdel, /* delete a configured device */
37 Fdisk, /* set default tree and sector sz*/
38 Fcrypt, /* encrypted device */
41 Blksize = 8*1024, /* for Finter only */
43 Incr = 5, /* Increments for the dev array */
46 * All qids are decorated with the tree number.
47 * #k/fs is tree number 0, is automatically added and
48 * its first qid is for the ctl file. It never goes away.
51 Qdir, /* directory (#k/fs) */
52 Qctl, /* ctl, only for #k/fs/ctl */
53 Qfirst, /* first qid assigned for device */
61 /* tunable parameters */
62 Maxconf = 4*1024, /* max length for config */
63 Ndevs = 32, /* max. inner devs per command */
64 Ntrees = 128, /* max. number of trees */
65 Maxretries = 3, /* max. retries of i/o errors */
66 Retrypause = 5000, /* ms. to pause between retries */
69 typedef struct Inner Inner;
70 typedef struct Fsdev Fsdev;
71 typedef struct Tree Tree;
72 typedef struct Key Key;
76 char *iname; /* inner device name */
77 vlong isize; /* size of inner device */
78 Chan *idev; /* inner device */
83 Ref; /* one per Chan doing I/O */
84 int gone; /* true if removed */
85 int vers; /* qid version for this device */
86 int type; /* Fnone, Fmirror, ... */
87 char *name; /* name for this fsdev */
88 Tree* tree; /* where the device is kept */
89 vlong size; /* min(inner[X].isize) */
90 vlong start; /* start address (for Fpart) */
91 uint ndevs; /* number of inner devices */
92 Inner *inner[Ndevs]; /* inner devices */
93 void *extra; /* extra state for the device */
98 char *name; /* name for #k/<name> */
99 Fsdev **devs; /* devices in dir. */
100 uint ndevs; /* number of devices */
101 uint nadevs; /* number of allocated devices in devs */
108 #define dprint if(debug)print
110 extern Dev fsdevtab; /* forward */
112 static RWlock lck; /* r: use devices; w: change config */
113 static Tree fstree; /* The main "fs" tree. Never goes away */
114 static Tree *trees[Ntrees]; /* internal representation of config */
115 static int ntrees; /* max number of trees */
117 static char *disk; /* default tree name used */
118 static char *source; /* default inner device used */
119 static int sectorsz = Sectorsz; /* default sector size */
120 static char confstr[Maxconf]; /* textual configuration */
124 static char cfgstr[] = "fsdev:\n";
126 static Qid tqid = {Qtop, 0, QTDIR};
127 static Qid cqid = {Qctl, 0, 0};
129 static char* tnames[] = {
137 static Cmdtab configs[] = {
148 static char Egone[] = "device is gone"; /* file has been removed */
151 seprintdev(char *s, char *e, Fsdev *mp)
156 return seprint(s, e, "<null Fsdev>");
157 if(mp->type < 0 || mp->type >= nelem(tnames) || tnames[mp->type] == nil)
158 return seprint(s, e, "bad device type %d\n", mp->type);
160 s = strecpy(s, e, tnames[mp->type]);
161 if(mp->tree != &fstree)
162 s = seprint(s, e, " %s/%s", mp->tree->name, mp->name);
164 s = seprint(s, e, " %s", mp->name);
165 for(i = 0; i < mp->ndevs; i++)
166 s = seprint(s, e, " %s", mp->inner[i]->iname);
172 s = strecpy(s, e, "\n");
175 s = seprint(s, e, " %ulld %ulld\n", mp->start, mp->size);
178 panic("#k: seprintdev bug");
184 mkpath(int tree, int devno)
186 return (tree&0xFFFF)<<16 | devno&0xFFFF;
192 return q>>16 & 0xFFFF;
202 gettree(int i, int mustexist)
204 dprint("gettree %d\n", i);
206 panic("#k: bug: bad tree index %d in gettree", i);
207 if(i >= ntrees || trees[i] == nil)
216 getdev(Tree *t, int i, int mustexist)
218 dprint("getdev %d\n", i);
220 panic("#k: bug: bad dev index %d in getdev", i);
221 if(i >= t->nadevs || t->devs[i] == nil)
234 dprint("path2dev %ux\n", q);
235 t = gettree(path2treeno(q), Mustexist);
236 return getdev(t, path2devno(q) - Qfirst, Mustexist);
240 treealloc(char *name)
245 dprint("treealloc %s\n", name);
246 for(i = 0; i < nelem(trees); i++)
249 if(i == nelem(trees))
251 t = trees[i] = mallocz(sizeof(Tree), 1);
256 kstrdup(&t->name, name);
261 lookuptree(char *name)
265 dprint("lookuptree %s\n", name);
266 for(i = 0; i < ntrees; i++)
267 if(trees[i] != nil && strcmp(trees[i]->name, name) == 0)
273 devalloc(Tree *t, char *name)
278 dprint("devalloc %s %s\n", t->name, name);
279 mp = mallocz(sizeof(Fsdev), 1);
282 for(i = 0; i < t->nadevs; i++)
283 if(t->devs[i] == nil)
286 if(t->nadevs % Incr == 0){
287 ndevs = t->nadevs + Incr;
288 devs = realloc(t->devs, ndevs * sizeof(Fsdev*));
295 t->devs[t->nadevs] = nil;
298 kstrdup(&mp->name, name);
299 mp->vers = ++qidvers;
311 dprint("deltree %s\n", t->name);
312 for(i = 0; i < ntrees; i++)
314 if(i > 0){ /* "fs" never goes away */
322 panic("#k: deltree: bug: tree not found");
326 * A device is gone and we know that all its users are gone.
327 * A tree is gone when all its devices are gone ("fs" is never gone).
328 * Must close devices outside locks, so we could nest our own devices.
337 dprint("deldev %s gone %d ref %uld\n", mp->name, mp->gone, mp->ref);
340 mp->vers = ++qidvers;
344 for(i = 0; i < t->nadevs; i++)
345 if(t->devs[i] == mp){
355 for(i = 0; i < mp->ndevs; i++){
363 memset(mp, 9, sizeof *mp); /* poison */
368 * Delete one or all devices in one or all trees.
371 mdelctl(char *tname, char *dname)
373 int i, alldevs, alltrees, some;
377 dprint("delctl %s\n", dname);
378 alldevs = strcmp(dname, "*") == 0;
379 alltrees = strcmp(tname, "*") == 0;
383 for(i = 0; i < ntrees; i++){
387 if(alltrees == 0 && strcmp(t->name, tname) != 0)
389 for(i = 0; i < t->nadevs; i++){
391 if(t->devs[i] == nil)
393 if(alldevs == 0 && strcmp(mp->name, dname) != 0)
396 * Careful: must close outside locks and that
397 * may change the file tree we are looking at.
402 incref(mp); /* keep it there */
405 goto Again; /* tree can change */
410 if(some == 0 && alltrees == 0)
415 setdsize(Fsdev* mp, vlong *ilen)
421 dprint("setdsize %s\n", mp->name);
422 for (i = 0; i < mp->ndevs; i++){
428 /* truncate to multiple of Blksize */
429 inlen &= ~(Blksize-1);
433 /* use size of smallest inner device */
434 if (mp->size == 0 || mp->size > inlen)
441 if(mp->start > inlen)
442 error("partition starts after device end");
443 if(inlen < mp->start + mp->size){
444 print("#k: %s: partition truncated from "
445 "%lld to %lld bytes\n", mp->name,
446 mp->size, inlen - mp->start);
447 mp->size = inlen - mp->start;
451 if(inlen > (64*1024)) {
452 mp->size = inlen - (64 * 1024);
459 if(mp->type == Finter)
460 mp->size *= mp->ndevs;
464 validdevname(Tree *t, char *dname)
468 for(i = 0; i < t->nadevs; i++)
469 if(t->devs[i] != nil && strcmp(t->devs[i]->name, dname) == 0)
474 parseconfig(char *a, long n, Cmdbuf **cbp, Cmdtab **ctp)
479 *cbp = cb = parsecmd(a, n);
480 *ctp = ct = lookupcmd(cb, configs, nelem(configs));
482 cb->f++; /* skip command */
489 error("too few arguments for ctl");
490 if(cb->nf - 1 > Ndevs)
491 error("too many devices in ctl");
494 if(cb->nf < 1 || cb->nf > 3)
495 error("ctl usage: disk name [sz dev]");
498 if(cb->nf != 4 && (cb->nf != 3 || source == nil))
499 error("ctl usage: part new [file] off len");
503 error("ctl usage: crypt newname device keyhex");
509 parsename(char *name, char *disk, char **tree, char **dev)
513 slash = strchr(name, '/');
532 uchar buf[128]; /* old DIRLEN plus a little should be plenty */
536 l = devtab[c->type]->stat(c, buf, sizeof buf);
537 convM2D(buf, l, &d, nil);
542 * Process a single line of configuration,
543 * often of the form "cmd newname idev0 idev1".
544 * locking is tricky, because we need a write lock to
545 * add/remove devices yet adding/removing them may lead
546 * to calls to this driver that require a read lock (when
547 * inner devices are also provided by us).
550 mconfig(char* a, long n)
555 char *tname, *dname, *fakef[4];
564 /* ignore comments & empty lines */
565 if (*a == '\0' || *a == '#' || *a == '\n')
581 parseconfig(a, n, &cb, &ct);
584 kstrdup(&disk, cb->f[0]);
586 sectorsz = strtoul(cb->f[1], 0, 0);
590 kstrdup(&source, cb->f[2]);
601 mdelctl("*", "*"); /* del everything */
604 dec16(key, 32, cb->f[2], 64);
610 * got a request in the format of sd(3),
611 * pretend we got one in our format.
612 * later we change end to be len.
621 start = strtoll(cb->f[2], nil, 10);
622 size = strtoll(cb->f[3], nil, 10);
624 size -= start; /* it was end */
628 parsename(cb->f[0], disk, &tname, &dname);
629 for(i = 1; i < cb->nf; i++)
630 validname(cb->f[i], 1);
632 if(ct->index == Fdel){
633 mdelctl(tname, dname);
640 * Open all inner devices while we have only a read lock.
647 for(i = 1; i < cb->nf; i++)
648 if(idev != nil && idev[i-1] != nil)
657 idev = smalloc(sizeof(Chan*) * Ndevs);
658 ilen = smalloc(sizeof(vlong) * Ndevs);
659 for(i = 1; i < cb->nf; i++){
660 idev[i-1] = namec(cb->f[i], Aopen, ORDWR, 0);
661 ilen[i-1] = getlen(idev[i-1]);
667 * Get a write lock and add the device if we can.
675 t = lookuptree(tname);
677 validdevname(t, dname);
679 t = treealloc(tname);
681 error("no more trees");
683 mp = devalloc(t, dname);
685 if(t->ndevs == 0) /* it was created for us */
686 deltree(t); /* but we will not mdeldev() */
690 mp->type = ct->index;
691 if(mp->type == Fpart){
692 mp->start = start * sectorsz;
693 mp->size = size * sectorsz;
695 if(mp->type == Fcrypt) {
696 Key *k = mallocz(sizeof(Key), 1);
699 setupAESstate(&k->tweak, &key[0], 16, nil);
700 setupAESstate(&k->ecb, &key[16], 16, nil);
704 for(i = 1; i < cb->nf; i++){
705 inprv = mp->inner[i-1] = mallocz(sizeof(Inner), 1);
709 kstrdup(&inprv->iname, cb->f[i]);
710 inprv->idev = idev[i-1];
732 /* only read config file once */
738 /* add the std "fs" tree */
743 /* identify the config file */
744 s = getconf("fsconfig");
747 s = "/dev/sdC0/fscfg";
758 cc = namec(s, Aopen, OREAD, 0);
763 devtab[cc->type]->read(cc, confstr, sizeof confstr, 0);
767 /* validate, copy and erase config; mconfig will repopulate confstr */
768 if (strncmp(confstr, cfgstr, sizeof cfgstr - 1) != 0)
769 error("bad #k config, first line must be: 'fsdev:\\n'");
772 kstrdup(&c, confstr + sizeof cfgstr - 1);
777 memset(confstr, 0, sizeof confstr);
778 /* process config copy one line at a time */
779 for (p = c; p != nil && *p != '\0'; p = e){
790 poperror(); /* mustrd */
794 mgen(Chan *c, char*, Dirtab*, int, int i, Dir *dp)
801 dprint("mgen %#ullx %d\n", c->qid.path, i);
804 if(c->qid.path == Qtop){
806 devdir(c, tqid, "#k", 0, eve, DMDIR|0775, dp);
809 t = gettree(i, Optional);
814 qid.path = mkpath(i, Qdir);
815 devdir(c, qid, t->name, 0, eve, DMDIR|0775, dp);
819 treeno = path2treeno(c->qid.path);
820 t = gettree(treeno, Optional);
825 if((c->qid.type & QTDIR) != 0){
827 devdir(c, tqid, "#k", 0, eve, DMDIR|0775, dp);
831 /* take care of #k/fs/ctl */
833 devdir(c, cqid, "ctl", 0, eve, 0664, dp);
838 mp = getdev(t, i, Optional);
845 qid.path = mkpath(treeno, Qfirst+i);
846 devdir(c, qid, mp->name, mp->size, eve, 0664, dp);
851 qid.path = mkpath(treeno, Qdir);
852 devdir(c, qid, t->name, 0, eve, DMDIR|0775, dp);
863 return devattach(fsdevtab.dc, spec);
867 mwalk(Chan *c, Chan *nc, char **name, int nname)
873 dprint("mwalk %llux\n", c->qid.path);
879 wq = devwalk(c, nc, name, nname, 0, 0, mgen);
886 mstat(Chan *c, uchar *db, int n)
894 dprint("mstat %llux\n", c->qid.path);
901 memset(&d, 0, sizeof d);
904 devdir(c, tqid, "#k", 0, eve, DMDIR|0775, &d);
907 devdir(c, cqid, "ctl", 0, eve, 0664, &d);
910 t = gettree(path2treeno(p), Mustexist);
911 if(c->qid.type & QTDIR)
912 devdir(c, c->qid, t->name, 0, eve, DMDIR|0775, &d);
914 mp = getdev(t, path2devno(p) - Qfirst, Mustexist);
917 devdir(c, q, mp->name, mp->size, eve, 0664, &d);
920 n = convD2M(&d, db, n);
929 mopen(Chan *c, int omode)
934 dprint("mopen %llux\n", c->qid.path);
935 if((c->qid.type & QTDIR) && omode != OREAD)
937 if(c->qid.path != Qctl && (c->qid.type&QTDIR) == 0){
952 * Our mgen does not return the info for the qid
953 * but only for its children. Don't use devopen here.
956 c->mode = openmode(omode);
967 dprint("mclose %llux\n", c->qid.path);
968 if(c->qid.type & QTDIR || !(c->flag & COPEN))
980 disk = nil; /* restore defaults */
986 if(mp->gone != 0 && mp->ref == 1)
998 io(Fsdev *mp, Inner *in, int isread, void *a, long l, vlong off)
1007 print("#k: %s: byte %,lld count %ld (of #k/%s): %s error: %s\n",
1008 in->iname, off, l, mp->name, (isread? "read": "write"),
1009 (up && up->errstr? up->errstr: ""));
1013 wl = devtab[mc->type]->read(mc, a, l, off);
1015 wl = devtab[mc->type]->write(mc, a, l, off);
1021 cryptio(Fsdev *mp, int isread, uchar *a, long l, vlong off)
1023 long wl, ws, wo, wb;
1033 if(off < 0 || l <= 0 || ((off|l) & (Sectsz-1)))
1041 off += 64*1024; // Header
1048 print("#k: %s: byte %,lld count %ld (of #k/%s): %s error: %s\n",
1049 in->iname, off, l, mp->name, (isread? "read": "write"),
1050 (up && up->errstr? up->errstr: ""));
1053 for(ws = 0; ws < l; ws += wo){
1058 wo = devtab[mc->type]->read(mc, buf, wo, off);
1062 for(wl=0; wl<wo; wl+=Sectsz)
1063 aes_xts_decrypt(k->tweak.ekey, k->ecb.dkey, off+wl, buf+wl, a+wl, Sectsz);
1065 for(wl=0; wl<wo; wl+=Sectsz)
1066 aes_xts_encrypt(k->tweak.ekey, k->ecb.ekey, off+wl, a+wl, buf+wl, Sectsz);
1067 if(devtab[mc->type]->write(mc, buf, wo, off) != wo)
1079 /* NB: a transfer could span multiple inner devices */
1081 catio(Fsdev *mp, int isread, void *a, long n, vlong off)
1088 print("catio %d %p %ld %lld\n", isread, a, n, off);
1090 for (i = 0; n > 0 && i < mp->ndevs; i++){
1092 if (off >= in->isize){
1094 continue; /* not there yet */
1096 if (off + n > in->isize)
1097 l = in->isize - off;
1101 print("\tdev %d %p %ld %lld\n", i, a, l, off);
1103 if (io(mp, in, isread, a, l, off) != l)
1111 print("\tres %ld\n", res - n);
1116 interio(Fsdev *mp, int isread, void *a, long n, vlong off)
1119 long boff, res, l, wl, wsz;
1120 vlong woff, blk, mblk;
1122 blk = off / Blksize;
1123 boff = off % Blksize;
1124 wsz = Blksize - boff;
1127 mblk = blk / mp->ndevs;
1128 i = blk % mp->ndevs;
1129 woff = mblk*Blksize + boff;
1135 wl = io(mp, mp->inner[i], isread, a, l, woff);
1149 seprintconf(char *s, char *e)
1155 for(i = 0; i < ntrees; i++){
1158 for(j = 0; j < t->nadevs; j++)
1159 if(t->devs[j] != nil)
1160 s = seprintdev(s, e, t->devs[j]);
1166 mread(Chan *c, void *a, long n, vlong off)
1173 dprint("mread %llux\n", c->qid.path);
1180 if(c->qid.type & QTDIR){
1181 res = devdirread(c, a, n, 0, 0, mgen);
1184 if(c->qid.path == Qctl){
1185 seprintconf(confstr, confstr + sizeof(confstr));
1186 res = readstr((long)off, a, n, confstr);
1190 t = gettree(path2treeno(c->qid.path), Mustexist);
1191 mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);
1193 if(off >= mp->size){
1197 if(off + n > mp->size)
1206 res = catio(mp, Isread, a, n, off);
1209 res = interio(mp, Isread, a, n, off);
1212 res = io(mp, mp->inner[0], Isread, a, n, mp->start + off);
1218 print("#k/%s: retry %d read for byte %,lld "
1219 "count %ld: %s\n", mp->name, retry, off,
1220 n, (up && up->errstr? up->errstr: ""));
1222 * pause before retrying in case it's due to
1223 * a transient bus or controller problem.
1225 tsleep(&up->sleep, return0, 0, Retrypause);
1227 for (i = 0; i < mp->ndevs; i++){
1230 l = io(mp, mp->inner[i], Isread, a, n, off);
1234 break; /* read a good copy */
1237 } while (i == mp->ndevs && ++retry <= Maxretries);
1238 if (retry > Maxretries) {
1239 /* no mirror had a good copy of the block */
1240 print("#k/%s: byte %,lld count %ld: CAN'T READ "
1241 "from mirror: %s\n", mp->name, off, n,
1242 (up && up->errstr? up->errstr: ""));
1244 } else if (retry > 0)
1245 print("#k/%s: byte %,lld count %ld: retry read OK "
1246 "from mirror: %s\n", mp->name, off, n,
1247 (up && up->errstr? up->errstr: ""));
1250 res = cryptio(mp, Isread, a, n, off);
1260 mwrite(Chan *c, void *a, long n, vlong off)
1262 int i, allbad, anybad, retry;
1267 dprint("mwrite %llux\n", c->qid.path);
1268 if (c->qid.type & QTDIR)
1270 if (c->qid.path == Qctl){
1281 t = gettree(path2treeno(c->qid.path), Mustexist);
1282 mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);
1284 if(off >= mp->size){
1288 if(off + n > mp->size)
1297 res = catio(mp, Iswrite, a, n, off);
1300 res = interio(mp, Iswrite, a, n, off);
1303 res = io(mp, mp->inner[0], Iswrite, a, n, mp->start + off);
1311 print("#k/%s: retry %d write for byte %,lld "
1312 "count %ld: %s\n", mp->name, retry, off,
1313 n, (up && up->errstr? up->errstr: ""));
1315 * pause before retrying in case it's due to
1316 * a transient bus or controller problem.
1318 tsleep(&up->sleep, return0, 0, Retrypause);
1322 for (i = mp->ndevs - 1; i >= 0; i--){
1327 l = io(mp, mp->inner[i], Iswrite, a, n, off);
1330 allbad = 0; /* wrote a good copy */
1334 } while (anybad && ++retry <= Maxretries);
1336 /* no mirror took a good copy of the block */
1337 print("#k/%s: byte %,lld count %ld: CAN'T WRITE "
1338 "to mirror: %s\n", mp->name, off, n,
1339 (up && up->errstr? up->errstr: ""));
1341 } else if (retry > 0)
1342 print("#k/%s: byte %,lld count %ld: retry wrote OK "
1343 "to mirror: %s\n", mp->name, off, n,
1344 (up && up->errstr? up->errstr: ""));
1348 res = cryptio(mp, Iswrite, a, n, off);