2 * Extraordinarily brute force Lempel & Ziv-like
3 * compressor. The input file must fit in memory
4 * during compression, and the output file will
5 * be reconstructed in memory during decompression.
6 * We search for large common sequences and use a
7 * greedy algorithm to choose which sequence gets
10 * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
12 * Output format is a series of blocks followed by
13 * a raw data section. Each block begins with a 32-bit big-endian
14 * number. The top bit is type and the next 31 bits
15 * are uncompressed size. Type is one of
16 * 0 - use raw data for this length
17 * 1 - a 32-bit offset follows
18 * After the blocks come the raw data. (The end of the blocks can be
19 * noted by summing block lengths until you reach the file length.)
34 enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
40 ulong sum32(ulong, void*, long);
51 typedef struct Node Node;
66 uint replacesame = 8*1024*1024;
68 Node *freelist, **freeend;
79 nodepool = malloc(sizeof(Node)*nnodepool);
86 assert(nnodepool > 0);
88 n = &nodepool[--nnodepool];
116 for(nhash=1; nhash<x; nhash<<=1)
118 hash = sbrk(sizeof(Node*)*nhash);
121 top = &hash[key&(nhash-1)];
123 for(l=top; *l; l=&(*l)->link){
125 if((*l)->key == key){
142 return *llookup(key);
146 insertnode(ulong key, ulong offset)
153 if(l==&hash[key&(nhash-1)])
160 /* add or replace last */
161 for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
163 n->offset[i] = offset;
167 Bputint(Biobufhdr *b, int n)
183 fprint(2, "Raw %d+%d\n", rawstart, nraw);
185 Bputint(&bout, (1<<31)|nraw);
186 memmove(odat+nodat, data+rawstart, nraw);
204 refblock(int i, int len, int off)
211 fprint(2, "Copy %d+%d from %d\n", i, len, off);
219 cmprun(uchar *a, uchar *b, int len)
225 for(i=0; i<len && a[i]==b[i]; i++)
235 for(i=0; a[i]==a[0]; i++)
243 int best, i, j, o, rle, run, maxrun, maxoff;
248 for(i=0; i<win && i<length; i++)
249 sum = (sum*256+data[i])%Prime;
250 for(i=0; i<length-win; ){
254 fprint(2, "look %.6lux\n", sum);
258 for(o=0; o<NOFF; o++){
259 if(n->offset[o] == -1)
261 run = cmprun(data+i, data+n->offset[o], length-i);
262 if(run > maxrun && n->offset[o]+mindist < i){
264 maxoff = n->offset[o];
270 for(j=best; j>0; j--)
271 n->offset[j] = n->offset[j-1];
277 j = i+refblock(i, maxrun, maxoff);
281 /* avoid huge chains from large runs of same byte */
282 rle = countrle(data+i);
285 else if(rle>maxrle[data[i]]){
286 maxrle[data[i]] = rle;
289 sum = (sum*256+data[i+win]) % Prime;
290 sum = (sum + data[i]*outn) % Prime;
293 /* could do better here */
302 fprint(2, "usage: bflz [-n winsize] [file]\n");
307 main(int argc, char **argv)
317 replacesame = atoi(EARGF(usage()));
320 mindist = atoi(EARGF(usage()));
323 win = atoi(EARGF(usage()));
337 if((fd = open(argv[0], OREAD)) < 0)
338 sysfatal("open %s: %r", argv[0]);
342 while((n = readn(fd, buf, sizeof buf)) > 0){
343 data = realloc(data, length+n);
345 sysfatal("realloc: %r");
346 memmove(data+length, buf, n);
351 odat = malloc(length);
353 sysfatal("malloc: %r");
355 Binit(&bout, 1, OWRITE);
356 Bprint(&bout, "BLZ\n");
357 Bputint(&bout, length);
360 outn = (outn * 256) % Prime;
363 fprint(2, "256^%d = %.6lux\n", win, outn);
366 fprint(2, "outn = %.6lux\n", outn);
369 Bwrite(&bout, odat, nodat);
371 fprint(2, "brk %p\n", sbrk(1));
372 fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);