2 * Index, mapping scores to log positions.
4 * The index is made up of some number of index sections, each of
5 * which is typically stored on a different disk. The blocks in all the
6 * index sections are logically numbered, with each index section
7 * responsible for a range of blocks. Blocks are typically 8kB.
9 * The N index blocks are treated as a giant hash table. The top 32 bits
10 * of score are used as the key for a lookup. Each index block holds
11 * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
13 * The index is sized so that a particular bucket is extraordinarily
14 * unlikely to overflow: assuming compressed data blocks are 4kB
15 * on disk, and assuming each block has a 40 byte index entry,
16 * the index data will be 1% of the total data. Since scores are essentially
17 * random, all buckets should be about the same fullness.
18 * A factor of 5 gives us a wide comfort boundary to account for
19 * random variation. So the index disk space should be 5% of the arena disk space.
26 static int initindex1(Index*);
27 static ISect *initisect1(ISect *is);
29 #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)	/* k shifted right by 32-d (the top d bits when k is 32 bits wide); 0 when d is 0, which also avoids a shift by 32 */
31 static char IndexMagic[] = "venti index configuration";	/* first line of the index configuration: written by outputindex, compared by parseindex */
34 initindex(char *name, ISect **sects, int n)
39 u32int last, blocksize, tabsize;
44 seterr(EOk, "no index sections to initialize index");
49 fprint(2, "no mem\n");
50 seterr(EOk, "can't initialize index: out of memory");
55 tabsize = sects[0]->tabsize;
56 if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0)
58 if(parseindex(&f, ix) < 0){
64 if(namecmp(ix->name, name) != 0){
65 seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
69 seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects);
75 blocksize = ix->blocksize;
76 for(i = 0; i < ix->nsects; i++){
78 if(namecmp(ix->name, is->index) != 0
79 || is->blocksize != blocksize
80 || is->tabsize != tabsize
81 || namecmp(is->name, ix->smap[i].name) != 0
82 || is->start != ix->smap[i].start
83 || is->stop != ix->smap[i].stop
85 || is->start > is->stop){
86 seterr(ECorrupt, "inconsistent index sections in %s", ix->name);
92 ix->tabsize = tabsize;
95 if(initindex1(ix) < 0){
100 ix->arenas = MKNZ(Arena*, ix->narenas);
101 if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
110 initindex1(Index *ix)
114 ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
115 buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
116 if(buckets != ix->buckets){
117 seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
132 seterr(EOk, "no sections in index %s", ix->name);
135 b = alloczblock(ix->tabsize, 1, ix->blocksize);
137 seterr(EOk, "can't write index configuration: out of memory");
141 if(outputindex(&f, ix) < 0){
142 seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
146 for(i = 0; i < ix->nsects; i++){
147 if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0
148 || flushpart(ix->sects[i]->part) < 0){
149 seterr(EOk, "can't write index: %r");
156 for(i = 0; i < ix->nsects; i++)
157 if(wbisect(ix->sects[i]) < 0)
164 * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
165 * version, blocksize: u32int
166 * name: max. ANameSize string
167 * sections, arenas: AMap
170 outputindex(Fmt *f, Index *ix)
172 if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
173 || outputamap(f, ix->smap, ix->nsects) < 0
174 || outputamap(f, ix->amap, ix->narenas) < 0)
180 parseindex(IFile *f, Index *ix)
190 if(s == nil || strcmp(s, IndexMagic) != 0){
191 seterr(ECorrupt, "bad index magic for %s", f->name);
198 if(ifileu32int(f, &v) < 0){
199 seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
203 if(ix->version != IndexVersion){
204 seterr(ECorrupt, "bad version number in %s", f->name);
211 if(ifilename(f, ix->name) < 0){
212 seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
219 if(ifileu32int(f, &v) < 0){
220 seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
225 if(parseamap(f, &amn) < 0)
230 if(parseamap(f, &amn) < 0)
239 * initialize an entirely new index
242 newindex(char *name, ISect **sects, int n)
247 u32int div, ub, xb, start, stop, blocksize, tabsize;
251 seterr(EOk, "creating index with no index sections");
256 * compute the total buckets available in the index,
257 * and the total buckets which are used.
260 blocksize = sects[0]->blocksize;
261 tabsize = sects[0]->tabsize;
262 for(i = 0; i < n; i++){
264 * allow index, start, and stop to be set if index is correct
265 * and start and stop are what we would have picked.
266 * this allows calling fmtindex to reformat the index after
267 * replacing a bad index section with a freshly formatted one.
268 * start and stop are checked below.
270 if(sects[i]->index[0] != '\0' && strcmp(sects[i]->index, name) != 0){
271 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
274 if(blocksize != sects[i]->blocksize){
275 seterr(EOk, "mismatched block sizes in index sections");
278 if(tabsize != sects[i]->tabsize){
279 seterr(EOk, "mismatched config table sizes in index sections");
282 nb += sects[i]->blocks;
286 * check for duplicate names
288 for(i = 0; i < n; i++){
289 for(j = i + 1; j < n; j++){
290 if(namecmp(sects[i]->name, sects[j]->name) == 0){
291 seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name);
297 if(nb >= ((u64int)1 << 32)){
298 fprint(2, "%s: index is 2^32 blocks or more; ignoring some of it\n",
300 nb = ((u64int)1 << 32) - 1;
303 div = (((u64int)1 << 32) + nb - 1) / nb;
305 fprint(2, "%s: index divisor %d too coarse; "
306 "index larger than needed, ignoring some of it\n",
309 nb = (((u64int)1 << 32) - 1) / (100 - 1);
311 ub = (((u64int)1 << 32) - 1) / div + 1;
313 seterr(EBug, "index initialization math wrong");
319 * initialize each of the index sections
320 * and the section map table
322 smap = MKNZ(AMap, n);
324 seterr(EOk, "can't create new index: out of memory");
328 for(i = 0; i < n; i++){
329 stop = start + sects[i]->blocks - xb / n;
333 if(sects[i]->start != 0 || sects[i]->stop != 0)
334 if(sects[i]->start != start || sects[i]->stop != stop){
335 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
339 sects[i]->start = start;
340 sects[i]->stop = stop;
341 namecp(sects[i]->index, name);
343 smap[i].start = start;
345 namecp(smap[i].name, sects[i]->name);
350 * initialize the index itself
354 seterr(EOk, "can't create new index: out of memory");
358 ix->version = IndexVersion;
359 namecp(ix->name, name);
363 ix->blocksize = blocksize;
365 ix->tabsize = tabsize;
368 if(initindex1(ix) < 0){
377 initisect(Part *part)
383 b = alloczblock(HeadSize, 0, 0);
384 if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){
385 seterr(EAdmin, "can't read index section header: %r");
395 ok = unpackisect(is, b->data);
398 seterr(ECorrupt, "corrupted index section header: %r");
403 if(is->version != ISectVersion1 && is->version != ISectVersion2){
404 seterr(EAdmin, "unknown index section version %d", is->version);
409 return initisect1(is);
413 newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize)
422 namecp(is->name, name);
425 is->blocksize = blocksize;
428 tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
429 is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1);
430 is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
432 if(is->version == ISectVersion2){
434 is->bucketmagic = fastrand();
435 }while(is->bucketmagic==0);
445 * initialize the computed parameters for an index
448 initisect1(ISect *is)
452 is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
453 is->blocklog = u64log2(is->blocksize);
454 if(is->blocksize != (1 << is->blocklog)){
455 seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
459 partblocksize(is->part, is->blocksize);
460 is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
461 if(is->tabbase >= is->blockbase){
462 seterr(ECorrupt, "index section config table overlaps bucket storage");
466 is->tabsize = is->blockbase - is->tabbase;
467 v = is->part->size & ~(u64int)(is->blocksize - 1);
468 if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
469 seterr(ECorrupt, "invalid blocks in index section %s", is->name);
476 if(is->stop - is->start > is->blocks){
477 seterr(ECorrupt, "index section overflows available space");
481 if(is->start > is->stop){
482 seterr(ECorrupt, "invalid index section range");
495 b = alloczblock(HeadSize, 1, 0);
501 if(packisect(is, b->data) < 0){
502 seterr(ECorrupt, "can't make index section header: %r");
506 if(writepart(is->part, PartBlank, b->data, HeadSize) < 0 || flushpart(is->part) < 0){
507 seterr(EAdmin, "can't write index section header: %r");
534 for(i = 0; i < ix->nsects; i++)
535 freeisect(ix->sects[i]);
542 * write a clump to an available arena in the index
543 * and return the address of the clump within the index.
544 ZZZ question: should this distinguish between an arena
545 filling up and real errors writing the clump?
548 writeiclump(Index *ix, Clump *c, u8int *clbuf)
555 trace(TraceLump, "writeiclump enter");
557 for(i = ix->mapalloc; i < ix->narenas; i++){
558 a = writeaclump(ix->arenas[i], c, clbuf);
561 ia.addr = ix->amap[i].start + a;
562 ia.type = c->info.type;
563 ia.size = c->info.uncsize;
564 ia.blocks = (c->info.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
565 as.arena = ix->arenas[i];
567 as.stats = as.arena->memstats;
568 insertscore(c->info.score, &ia, IEDirty, &as);
569 qunlock(&ix->writing);
570 trace(TraceLump, "writeiclump exit");
574 qunlock(&ix->writing);
576 seterr(EAdmin, "no space left in arenas");
577 trace(TraceLump, "writeiclump failed");
582 * convert an arena index to a relative arena address
585 amapitoa(Index *ix, u64int a, u64int *aa)
593 if(ix->amap[m].start <= a)
600 if(a > ix->amap[l].stop){
601 for(i=0; i<ix->narenas; i++)
602 print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
603 print("want arena %d for %llux\n", l, a);
604 seterr(ECrash, "unmapped address passed to amapitoa");
608 if(ix->arenas[l] == nil){
609 seterr(ECrash, "unmapped arena selected in amapitoa");
612 *aa = a - ix->amap[l].start;
613 return ix->arenas[l];
617 * convert an arena index to the bounds of the containing arena group.
620 amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
625 arena = amapitoa(ix, a, &aa);
628 if(arenatog(arena, aa, gstart, glimit, g) < 0)
636 iaddrcmp(IAddr *ia1, IAddr *ia2)
638 return ia1->type != ia2->type
639 || ia1->size != ia2->size
640 || ia1->blocks != ia2->blocks
641 || ia1->addr != ia2->addr;
645 * look up the score in the partition
647 * nothing needs to be explicitly locked:
648 * only static parts of ix are used, and
649 * the bucket is locked by the DBlock lock.
652 loadientry(Index *ix, u8int *score, int type, IEntry *ie)
662 trace(TraceLump, "loadientry enter");
667 qunlock(&stats.lock);
670 if(!inbloomfilter(mainindex->bloom, score)){
671 trace(TraceLump, "loadientry bloomhit");
675 trace(TraceLump, "loadientry loadibucket");
676 b = loadibucket(ix, score, &is, &buck, &ib);
677 trace(TraceLump, "loadientry loadedibucket");
681 if(okibucket(&ib, is) < 0){
682 trace(TraceLump, "loadientry badbucket");
686 h = bucklook(score, type, ib.data, ib.n);
689 trace(TraceLump, "loadientry found");
690 unpackientry(ie, &ib.data[h]);
694 trace(TraceLump, "loadientry notfound");
695 addstat(StatBloomFalseMiss, 1);
698 trace(TraceLump, "loadientry exit");
703 okibucket(IBucket *ib, ISect *is)
705 if(ib->n <= is->buckmax)
708 seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)",
709 ib->n, is->buckmax, is->start, is->stop);
714 * look for score within data;
715 * return 1 | byte index of matching index,
716 * or 0 | index of least element > score
719 bucklook(u8int *score, int otype, u8int *data, int n)
721 int i, r, l, m, h, c, cc, type;
726 type = vttodisktype(otype);
732 for(i = 0; i < VtScoreSize; i++){
743 cc = data[h + IEntryTypeOff];
744 if(type != cc && type != -1){
755 return l * IEntrySize;
759 * compare two IEntries; consistent with bucklook
762 ientrycmp(const void *vie1, const void *vie2)
769 for(i = 0; i < VtScoreSize; i++){
778 v1 = ie1[IEntryTypeOff];
779 v2 = ie2[IEntryTypeOff];
789 * find the number of the index section holding bucket #buck
792 indexsect0(Index *ix, u32int buck)
800 if(ix->sects[m]->start <= buck)
809 * load the index block at bucket #buck
812 loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode)
817 is = ix->sects[indexsect0(ix, buck)];
818 if(buck < is->start || is->stop <= buck){
819 seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
824 if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil)
832 unpackibucket(ib, b->data, is->bucketmagic);
837 * find the number of the index section holding score
840 indexsect1(Index *ix, u8int *score)
842 return indexsect0(ix, hashbits(score, 32) / ix->div);
846 * load the index block responsible for score.
849 loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
851 return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD);
855 indexsect(Index *ix, u8int *score)
857 return indexsect1(ix, score);
861 loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
863 return loadibucket1(ix, score, pis, pbuck, ib);