3 * Follows device config in Ken's file server.
4 * Builds mirrors, concatenations, interleavings, and partitions
5 * of devices out of other (inner) devices.
6 * It is ok if inner devices are provided by this driver.
8 * Built files are grouped into different directories
9 * (called trees, and used to represent disks).
10 * The "#k/fs" tree is always available and never goes away.
11 * Configuration changes happen only while no I/O is in progress.
13 * Default sector size is one byte unless changed by the "disk" ctl.
17 #include "../port/lib.h"
23 #include "../port/error.h"
24 #include "../port/sd.h"
30 Fmirror, /* mirror of others */
31 Fcat, /* catenation of others */
32 Finter, /* interleaving of others */
33 Fpart, /* part of other */
34 Fclear, /* start over */
35 Fdel, /* delete a configure device */
36 Fdisk, /* set default tree and sector sz*/
37 Fcrypt, /* encrypted device */
40 Blksize = 8*1024, /* for Finter only */
41 Cryptsectsz = 512, /* for Fcrypt only */
43 Incr = 5, /* Increments for the dev array */
46 * All qids are decorated with the tree number.
47 * #k/fs is tree number 0, is automatically added and
48 * its first qid is for the ctl file. It never goes away.
51 Qdir, /* directory (#k/fs) */
52 Qctl, /* ctl, only for #k/fs/ctl */
53 Qfirst, /* first qid assigned for device */
61 /* tunable parameters */
62 Maxconf = 4*1024, /* max length for config */
63 Ndevs = 32, /* max. inner devs per command */
64 Ntrees = 128, /* max. number of trees */
65 Maxretries = 3, /* max. retries of i/o errors */
66 Retrypause = 5000, /* ms. to pause between retries */
69 typedef struct Inner Inner;
70 typedef struct Fsdev Fsdev;
71 typedef struct Tree Tree;
72 typedef struct Key Key;
76 char *iname; /* inner device name */
77 vlong isize; /* size of inner device */
78 Chan *idev; /* inner device */
83 Ref; /* one per Chan doing I/O */
84 int gone; /* true if removed */
85 int vers; /* qid version for this device */
86 int type; /* Fnone, Fmirror, ... */
87 char *name; /* name for this fsdev */
88 Tree* tree; /* where the device is kept */
89 vlong size; /* min(inner[X].isize) */
90 vlong start; /* start address (for Fpart) */
91 uint ndevs; /* number of inner devices */
92 Inner *inner[Ndevs]; /* inner devices */
93 Key *key; /* crypt key */
98 char *name; /* name for #k/<name> */
99 Fsdev **devs; /* devices in dir. */
100 uint ndevs; /* number of devices */
101 uint nadevs; /* number of allocated devices in devs */
108 #define dprint if(debug)print
110 extern Dev fsdevtab; /* forward */
112 static RWlock lck; /* r: use devices; w: change config */
113 static Tree fstree; /* The main "fs" tree. Never goes away */
114 static Tree *trees[Ntrees]; /* internal representation of config */
115 static int ntrees; /* max number of trees */
117 static char *disk; /* default tree name used */
118 static char *source; /* default inner device used */
119 static int sectorsz = Sectorsz; /* default sector size */
120 static char confstr[Maxconf]; /* textual configuration */
124 static char cfgstr[] = "fsdev:\n";
126 static Qid tqid = {Qtop, 0, QTDIR};
127 static Qid cqid = {Qctl, 0, 0};
129 static char* tnames[] = {
137 static Cmdtab configs[] = {
148 static char Egone[] = "device is gone"; /* file has been removed */
151 seprintdev(char *s, char *e, Fsdev *mp)
156 return seprint(s, e, "<null Fsdev>");
157 if(mp->type < 0 || mp->type >= nelem(tnames) || tnames[mp->type] == nil)
158 return seprint(s, e, "bad device type %d\n", mp->type);
160 s = strecpy(s, e, tnames[mp->type]);
161 if(mp->tree != &fstree)
162 s = seprint(s, e, " %s/%s", mp->tree->name, mp->name);
164 s = seprint(s, e, " %s", mp->name);
165 for(i = 0; i < mp->ndevs; i++)
166 s = seprint(s, e, " %s", mp->inner[i]->iname);
172 s = strecpy(s, e, "\n");
175 s = seprint(s, e, " %ulld %ulld\n", mp->start, mp->size);
178 panic("#k: seprintdev bug");
184 mkpath(int tree, int devno)
186 return (tree&0xFFFF)<<16 | devno&0xFFFF;
192 return q>>16 & 0xFFFF;
202 gettree(int i, int mustexist)
204 dprint("gettree %d\n", i);
206 panic("#k: bug: bad tree index %d in gettree", i);
207 if(i >= ntrees || trees[i] == nil)
216 getdev(Tree *t, int i, int mustexist)
218 dprint("getdev %d\n", i);
220 panic("#k: bug: bad dev index %d in getdev", i);
221 if(i >= t->nadevs || t->devs[i] == nil)
234 dprint("path2dev %ux\n", q);
235 t = gettree(path2treeno(q), Mustexist);
236 return getdev(t, path2devno(q) - Qfirst, Mustexist);
240 treealloc(char *name)
245 dprint("treealloc %s\n", name);
246 for(i = 0; i < nelem(trees); i++)
249 if(i == nelem(trees))
251 t = trees[i] = mallocz(sizeof(Tree), 1);
256 kstrdup(&t->name, name);
261 lookuptree(char *name)
265 dprint("lookuptree %s\n", name);
266 for(i = 0; i < ntrees; i++)
267 if(trees[i] != nil && strcmp(trees[i]->name, name) == 0)
273 devalloc(Tree *t, char *name)
278 dprint("devalloc %s %s\n", t->name, name);
279 mp = mallocz(sizeof(Fsdev), 1);
282 for(i = 0; i < t->nadevs; i++)
283 if(t->devs[i] == nil)
286 if(t->nadevs % Incr == 0){
287 ndevs = t->nadevs + Incr;
288 devs = realloc(t->devs, ndevs * sizeof(Fsdev*));
295 t->devs[t->nadevs] = nil;
298 kstrdup(&mp->name, name);
299 mp->vers = ++qidvers;
311 dprint("deltree %s\n", t->name);
312 for(i = 0; i < ntrees; i++)
314 if(i > 0){ /* "fs" never goes away */
322 panic("#k: deltree: bug: tree not found");
326 * A device is gone and we know that all its users are gone.
327 * A tree is gone when all its devices are gone ("fs" is never gone).
328 * Must close devices outside locks, so that we can nest our own devices.
337 dprint("deldev %s gone %d ref %uld\n", mp->name, mp->gone, mp->ref);
340 mp->vers = ++qidvers;
344 for(i = 0; i < t->nadevs; i++)
345 if(t->devs[i] == mp){
356 for(i = 0; i < mp->ndevs; i++){
367 * Delete one or all devices in one or all trees.
370 mdelctl(char *tname, char *dname)
372 int i, alldevs, alltrees, some;
376 dprint("delctl %s\n", dname);
377 alldevs = strcmp(dname, "*") == 0;
378 alltrees = strcmp(tname, "*") == 0;
382 for(i = 0; i < ntrees; i++){
386 if(alltrees == 0 && strcmp(t->name, tname) != 0)
388 for(i = 0; i < t->nadevs; i++){
390 if(t->devs[i] == nil)
392 if(alldevs == 0 && strcmp(mp->name, dname) != 0)
395 * Careful: must close outside locks and that
396 * may change the file tree we are looking at.
401 incref(mp); /* keep it there */
404 goto Again; /* tree can change */
409 if(some == 0 && alltrees == 0)
414 setdsize(Fsdev* mp, vlong *ilen)
420 dprint("setdsize %s\n", mp->name);
421 for (i = 0; i < mp->ndevs; i++){
427 /* truncate to multiple of Blksize */
428 inlen &= ~(Blksize-1);
432 /* use size of smallest inner device */
433 if (mp->size == 0 || mp->size > inlen)
440 if(mp->start > inlen)
441 error("partition starts after device end");
442 if(inlen < mp->start + mp->size){
443 print("#k: %s: partition truncated from "
444 "%lld to %lld bytes\n", mp->name,
445 mp->size, inlen - mp->start);
446 mp->size = inlen - mp->start;
450 if(mp->start > inlen)
451 error("crypt starts after device end");
452 mp->size = (inlen - mp->start) & ~((vlong)Cryptsectsz-1);
456 if(mp->type == Finter)
457 mp->size *= mp->ndevs;
461 validdevname(Tree *t, char *dname)
465 for(i = 0; i < t->nadevs; i++)
466 if(t->devs[i] != nil && strcmp(t->devs[i]->name, dname) == 0)
471 parseconfig(char *a, long n, Cmdbuf **cbp, Cmdtab **ctp)
476 *cbp = cb = parsecmd(a, n);
477 *ctp = ct = lookupcmd(cb, configs, nelem(configs));
479 cb->f++; /* skip command */
486 error("too few arguments for ctl");
487 if(cb->nf - 1 > Ndevs)
488 error("too many devices in ctl");
491 if(cb->nf < 1 || cb->nf > 3)
492 error("ctl usage: disk name [sz dev]");
495 if(cb->nf != 4 && (cb->nf != 3 || source == nil))
496 error("ctl usage: part new [file] off len");
500 error("ctl usage: crypt newname device keyhex");
506 parsename(char *name, char *disk, char **tree, char **dev)
510 slash = strchr(name, '/');
529 uchar buf[128]; /* old DIRLEN plus a little should be plenty */
533 l = devtab[c->type]->stat(c, buf, sizeof buf);
534 convM2D(buf, l, &d, nil);
539 * Process a single line of configuration,
540 * often of the form "cmd newname idev0 idev1".
541 * Locking is tricky, because we need a write lock to
542 * add/remove devices yet adding/removing them may lead
543 * to calls to this driver that require a read lock (when
544 * inner devices are also provided by us).
547 mconfig(char* a, long n)
552 char *tname, *dname, *fakef[4];
562 /* ignore comments & empty lines */
563 if (*a == '\0' || *a == '#' || *a == '\n')
580 parseconfig(a, n, &cb, &ct);
583 kstrdup(&disk, cb->f[0]);
585 sectorsz = strtoul(cb->f[1], 0, 0);
589 kstrdup(&source, cb->f[2]);
600 mdelctl("*", "*"); /* del everything */
604 start = strtoul(cb->f[3], 0, 0);
607 start = 64*1024; /* cryptsetup header */
608 keylen = dec16(key, sizeof(key), cb->f[2], strlen(cb->f[2]));
621 * Got a request in the format of sd(3);
622 * pretend we got one in our format.
623 * Later we change end to be len.
632 start = strtoll(cb->f[2], nil, 10);
633 size = strtoll(cb->f[3], nil, 10);
635 size -= start; /* it was end */
639 parsename(cb->f[0], disk, &tname, &dname);
640 for(i = 1; i < cb->nf; i++)
641 validname(cb->f[i], 1);
643 if(ct->index == Fdel){
644 mdelctl(tname, dname);
651 * Open all inner devices while we have only a read lock.
658 for(i = 1; i < cb->nf; i++)
659 if(idev != nil && idev[i-1] != nil)
668 idev = smalloc(sizeof(Chan*) * Ndevs);
669 ilen = smalloc(sizeof(vlong) * Ndevs);
670 for(i = 1; i < cb->nf; i++){
671 idev[i-1] = namec(cb->f[i], Aopen, ORDWR, 0);
672 ilen[i-1] = getlen(idev[i-1]);
678 * Get a write lock and add the device if we can.
686 t = lookuptree(tname);
688 validdevname(t, dname);
690 t = treealloc(tname);
692 error("no more trees");
694 mp = devalloc(t, dname);
696 if(t->ndevs == 0) /* it was created for us */
697 deltree(t); /* but we will not mdeldev() */
701 mp->type = ct->index;
702 if(mp->type == Fpart){
703 mp->start = start * sectorsz;
704 mp->size = size * sectorsz;
706 if(mp->type == Fcrypt) {
707 Key *k = secalloc(sizeof(Key));
708 setupAESstate(&k->tweak, &key[0], keylen/2, nil);
709 setupAESstate(&k->ecb, &key[keylen/2], keylen/2, nil);
710 memset(key, 0, sizeof(key));
714 for(i = 1; i < cb->nf; i++){
715 inprv = mp->inner[i-1] = mallocz(sizeof(Inner), 1);
719 kstrdup(&inprv->iname, cb->f[i]);
720 inprv->idev = idev[i-1];
740 /* only read config file once */
746 /* add the std "fs" tree */
751 /* identify the config file */
752 s = getconf("fsconfig");
755 s = "/dev/sdC0/fscfg";
766 cc = namec(s, Aopen, OREAD, 0);
771 devtab[cc->type]->read(cc, confstr, sizeof confstr, 0);
775 /* validate, copy and erase config; mconfig will repopulate confstr */
776 if (strncmp(confstr, cfgstr, sizeof cfgstr - 1) != 0)
777 error("bad #k config, first line must be: 'fsdev:\\n'");
780 kstrdup(&c, confstr + sizeof cfgstr - 1);
785 memset(confstr, 0, sizeof confstr);
786 /* process config copy one line at a time */
787 for (p = c; p != nil && *p != '\0'; p = e){
798 poperror(); /* mustrd */
802 mgen(Chan *c, char*, Dirtab*, int, int i, Dir *dp)
809 dprint("mgen %#ullx %d\n", c->qid.path, i);
812 if(c->qid.path == Qtop){
814 devdir(c, tqid, "#k", 0, eve, DMDIR|0775, dp);
817 t = gettree(i, Optional);
822 qid.path = mkpath(i, Qdir);
823 devdir(c, qid, t->name, 0, eve, DMDIR|0775, dp);
827 treeno = path2treeno(c->qid.path);
828 t = gettree(treeno, Optional);
833 if((c->qid.type & QTDIR) != 0){
835 devdir(c, tqid, "#k", 0, eve, DMDIR|0775, dp);
839 /* take care of #k/fs/ctl */
841 devdir(c, cqid, "ctl", 0, eve, 0664, dp);
846 mp = getdev(t, i, Optional);
853 qid.path = mkpath(treeno, Qfirst+i);
854 devdir(c, qid, mp->name, mp->size, eve, 0664, dp);
859 qid.path = mkpath(treeno, Qdir);
860 devdir(c, qid, t->name, 0, eve, DMDIR|0775, dp);
871 return devattach(fsdevtab.dc, spec);
875 mwalk(Chan *c, Chan *nc, char **name, int nname)
881 dprint("mwalk %llux\n", c->qid.path);
887 wq = devwalk(c, nc, name, nname, 0, 0, mgen);
894 mstat(Chan *c, uchar *db, int n)
902 dprint("mstat %llux\n", c->qid.path);
909 memset(&d, 0, sizeof d);
912 devdir(c, tqid, "#k", 0, eve, DMDIR|0775, &d);
915 devdir(c, cqid, "ctl", 0, eve, 0664, &d);
918 t = gettree(path2treeno(p), Mustexist);
919 if(c->qid.type & QTDIR)
920 devdir(c, c->qid, t->name, 0, eve, DMDIR|0775, &d);
922 mp = getdev(t, path2devno(p) - Qfirst, Mustexist);
925 devdir(c, q, mp->name, mp->size, eve, 0664, &d);
928 n = convD2M(&d, db, n);
937 mopen(Chan *c, int omode)
942 dprint("mopen %llux\n", c->qid.path);
943 if((c->qid.type & QTDIR) && omode != OREAD)
945 if(c->qid.path != Qctl && (c->qid.type&QTDIR) == 0){
960 * Our mgen does not return the info for the qid
961 * but only for its children. Don't use devopen here.
964 c->mode = openmode(omode);
975 dprint("mclose %llux\n", c->qid.path);
976 if(c->qid.type & QTDIR || !(c->flag & COPEN))
988 disk = nil; /* restore defaults */
994 if(mp->gone != 0 && mp->ref == 1)
1006 io(Fsdev *mp, Inner *in, int isread, void *a, long l, vlong off)
1015 print("#k: %s: byte %,lld count %ld (of #k/%s): %s error: %s\n",
1016 in->iname, off, l, mp->name, (isread? "read": "write"),
1017 (up && up->errstr? up->errstr: ""));
1021 wl = devtab[mc->type]->read(mc, a, l, off);
1023 wl = devtab[mc->type]->write(mc, a, l, off);
1029 cryptio(Fsdev *mp, int isread, uchar *a, long n, vlong off)
1034 if((((ulong)off|n) & (Cryptsectsz-1)))
1037 l = io(mp, mp->inner[0], Isread, a, n, off);
1039 l &= ~(Cryptsectsz-1);
1040 for(o=0; o<l; o+=Cryptsectsz)
1041 aes_xts_decrypt(&mp->key->tweak, &mp->key->ecb,
1042 off+o, a+o, a+o, Cryptsectsz);
1046 nb = n < SDmaxio ? n : SDmaxio;
1047 while((b = sdmalloc(nb)) == nil){
1049 resrcwait("no memory for cryptio");
1057 for(l = 0; (m = n - l) > 0; l += m){
1059 for(o=0; o<m; o+=Cryptsectsz)
1060 aes_xts_encrypt(&mp->key->tweak, &mp->key->ecb,
1061 off+o, a+o, b+o, Cryptsectsz);
1062 if(io(mp, mp->inner[0], Iswrite, b, m, off) != m)
1072 /* NB: a transfer could span multiple inner devices */
1074 catio(Fsdev *mp, int isread, void *a, long n, vlong off)
1081 print("catio %d %p %ld %lld\n", isread, a, n, off);
1083 for (i = 0; n > 0 && i < mp->ndevs; i++){
1085 if (off >= in->isize){
1087 continue; /* not there yet */
1089 if (off + n > in->isize)
1090 l = in->isize - off;
1094 print("\tdev %d %p %ld %lld\n", i, a, l, off);
1096 if (io(mp, in, isread, a, l, off) != l)
1104 print("\tres %ld\n", res - n);
1109 interio(Fsdev *mp, int isread, void *a, long n, vlong off)
1112 long boff, res, l, wl, wsz;
1113 vlong woff, blk, mblk;
1115 blk = off / Blksize;
1116 boff = off % Blksize;
1117 wsz = Blksize - boff;
1120 mblk = blk / mp->ndevs;
1121 i = blk % mp->ndevs;
1122 woff = mblk*Blksize + boff;
1128 wl = io(mp, mp->inner[i], isread, a, l, woff);
1142 seprintconf(char *s, char *e)
1148 for(i = 0; i < ntrees; i++){
1151 for(j = 0; j < t->nadevs; j++)
1152 if(t->devs[j] != nil)
1153 s = seprintdev(s, e, t->devs[j]);
1159 mread(Chan *c, void *a, long n, vlong off)
1166 dprint("mread %llux\n", c->qid.path);
1173 if(c->qid.type & QTDIR){
1174 res = devdirread(c, a, n, 0, 0, mgen);
1177 if(c->qid.path == Qctl){
1178 seprintconf(confstr, confstr + sizeof(confstr));
1179 res = readstr((long)off, a, n, confstr);
1183 t = gettree(path2treeno(c->qid.path), Mustexist);
1184 mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);
1186 if(off >= mp->size){
1190 if(off + n > mp->size)
1199 res = catio(mp, Isread, a, n, off);
1202 res = interio(mp, Isread, a, n, off);
1205 res = io(mp, mp->inner[0], Isread, a, n, mp->start + off);
1211 print("#k/%s: retry %d read for byte %,lld "
1212 "count %ld: %s\n", mp->name, retry, off,
1213 n, (up && up->errstr? up->errstr: ""));
1215 * pause before retrying in case it's due to
1216 * a transient bus or controller problem.
1218 tsleep(&up->sleep, return0, 0, Retrypause);
1220 for (i = 0; i < mp->ndevs; i++){
1223 l = io(mp, mp->inner[i], Isread, a, n, off);
1227 break; /* read a good copy */
1230 } while (i == mp->ndevs && ++retry <= Maxretries);
1231 if (retry > Maxretries) {
1232 /* no mirror had a good copy of the block */
1233 print("#k/%s: byte %,lld count %ld: CAN'T READ "
1234 "from mirror: %s\n", mp->name, off, n,
1235 (up && up->errstr? up->errstr: ""));
1237 } else if (retry > 0)
1238 print("#k/%s: byte %,lld count %ld: retry read OK "
1239 "from mirror: %s\n", mp->name, off, n,
1240 (up && up->errstr? up->errstr: ""));
1243 res = cryptio(mp, Isread, a, n, mp->start + off);
1253 mwrite(Chan *c, void *a, long n, vlong off)
1255 int i, allbad, anybad, retry;
1260 dprint("mwrite %llux\n", c->qid.path);
1261 if (c->qid.type & QTDIR)
1263 if (c->qid.path == Qctl){
1274 t = gettree(path2treeno(c->qid.path), Mustexist);
1275 mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);
1277 if(off >= mp->size){
1281 if(off + n > mp->size)
1290 res = catio(mp, Iswrite, a, n, off);
1293 res = interio(mp, Iswrite, a, n, off);
1296 res = io(mp, mp->inner[0], Iswrite, a, n, mp->start + off);
1304 print("#k/%s: retry %d write for byte %,lld "
1305 "count %ld: %s\n", mp->name, retry, off,
1306 n, (up && up->errstr? up->errstr: ""));
1308 * pause before retrying in case it's due to
1309 * a transient bus or controller problem.
1311 tsleep(&up->sleep, return0, 0, Retrypause);
1315 for (i = mp->ndevs - 1; i >= 0; i--){
1320 l = io(mp, mp->inner[i], Iswrite, a, n, off);
1323 allbad = 0; /* wrote a good copy */
1327 } while (anybad && ++retry <= Maxretries);
1329 /* no mirror took a good copy of the block */
1330 print("#k/%s: byte %,lld count %ld: CAN'T WRITE "
1331 "to mirror: %s\n", mp->name, off, n,
1332 (up && up->errstr? up->errstr: ""));
1334 } else if (retry > 0)
1335 print("#k/%s: byte %,lld count %ld: retry wrote OK "
1336 "to mirror: %s\n", mp->name, off, n,
1337 (up && up->errstr? up->errstr: ""));
1341 res = cryptio(mp, Iswrite, a, n, mp->start + off);