7 00/ff for end of file can conflict with 00/ff characters
12 Nline = 100000, /* default max number of lines saved in memory */
13 Nmerge = 10, /* max number of temporary files merged */
14 Nfield = 20, /* max number of argument fields */
16 Bflag = 1<<0, /* flags per field */
28 NSstart = 0, /* states for number to key decoding */
40 typedef struct Line Line;
41 typedef struct Key Key;
42 typedef struct Merge Merge;
43 typedef struct Field Field;
48 int llen; /* always >= 1 */
49 uchar line[1]; /* always ends in '\n' */
54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
55 Line* line; /* line at the head of a merged temp file */
56 int fd; /* file descriptor */
57 Biobuf b; /* iobuf for reading a temp file */
76 void (*dokey)(Key*, uchar*, uchar*, Field*);
92 long nline; /* number of lines in this temp file */
93 long lineno; /* overall ordinal for -s option */
95 long mline; /* max lines per file */
98 extern int latinmap[];
99 extern Rune* month[12];
101 void buildkey(Line*);
102 void doargs(int, char*[]);
103 void dofield(char*, int*, int*, int, int);
104 void dofile(Biobuf*);
105 void dokey_(Key*, uchar*, uchar*, Field*);
106 void dokey_dfi(Key*, uchar*, uchar*, Field*);
107 void dokey_gn(Key*, uchar*, uchar*, Field*);
108 void dokey_m(Key*, uchar*, uchar*, Field*);
109 void dokey_r(Key*, uchar*, uchar*, Field*);
111 void* emalloc(ulong);
112 void* erealloc(void*, ulong);
113 int kcmp(Key*, Key*);
114 void makemapd(Field*);
115 void makemapm(Field*);
116 void mergefiles(int, int, Biobuf*);
117 void mergeout(Biobuf*);
119 Line* newline(Biobuf*);
120 void notifyf(void*, char*);
121 void printargs(void);
122 void printout(Biobuf*);
123 void setfield(int, int);
124 uchar* skip(uchar*, int, int, int, int);
125 void sort4(void*, ulong);
128 void lineout(Biobuf*, Line*);
131 main(int argc, char *argv[])
137 notify(notifyf); /**/
142 for(i=1; i<argc; i++) {
143 if((s = argv[i]) == nil)
145 if(strcmp(s, "-") == 0) {
146 Binit(&bbuf, 0, OREAD);
153 fprint(2, "sort: open %s: %r\n", s);
156 Binit(&bbuf, f, OREAD);
161 if(args.nfile == 0) {
162 Binit(&bbuf, 0, OREAD);
169 fprint(2, "=========\n");
173 f = create(args.ofile, OWRITE, 0666);
175 fprint(2, "sort: create %s: %r\n", args.ofile);
180 Binit(&bbuf, f, OWRITE);
198 if((ol = newline(b)) == nil)
201 if((l = newline(b)) == nil)
203 n = kcmp(ol->key, l->key);
204 if(n > 0 || (n == 0 && args.uflag)) {
205 fprint(2, "sort: -c file not in sort\n"); /**/
215 if(args.linep == nil)
216 args.linep = emalloc(args.mline * sizeof(args.linep));
218 if((l = newline(b)) == nil)
220 if(args.nline >= args.mline)
222 args.linep[args.nline] = l;
229 notifyf(void*, char *s)
232 if(strcmp(s, "interrupt") == 0)
234 if(strcmp(s, "hangup") == 0)
236 if(strcmp(s, "kill") == 0)
238 if(strncmp(s, "sys: write on closed pipe", 25) == 0)
240 fprint(2, "sort: note: %s\n", s);
251 p = Brdline(b, '\n');
259 l = erealloc(l, sizeof(Line) +
260 (n+31)*sizeof(l->line[0]));
263 fprint(2, "sort: newline added\n");
267 sysfatal("bug: l == nil");
276 l = emalloc(sizeof(Line) + (n-1)*sizeof(l->line[0]));
278 memmove(l->line, p, n);
284 lineout(Biobuf *b, Line *l)
289 m = Bwrite(b, l->line, n);
303 sort4(args.linep, args.nline);
304 tf = tempfile(args.ntemp);
306 f = create(tf, OWRITE, 0666);
308 fprint(2, "sort: create %s: %r\n", tf);
312 Binit(&tb, f, OWRITE);
314 for(n=args.nline; n>0; n--) {
330 for(i=0; i<args.ntemp; i++)
336 erealloc(void *v, ulong n)
338 if((v = realloc(v, n)) == nil && n != 0){
339 fprint(2, "realloc: %r\n");
351 if((v = malloc(n)) == nil){
352 fprint(2, "malloc: %r\n");
363 static char file[100];
370 if(strlen(dir) >= nelem(file)-20) {
371 fprint(2, "temp file directory name is too long: %s\n", dir);
384 snprint(file, sizeof(file), "%s/sort.%.4d.%.4d", dir, pid%10000, n);
395 for(i=0; i<args.ntemp; i+=n) {
398 tf = tempfile(args.ntemp);
400 f = create(tf, OWRITE, 0666);
402 fprint(2, "sort: create %s: %r\n", tf);
405 Binit(&tb, f, OWRITE);
408 mergefiles(i, n, &tb);
418 mergefiles(int t, int n, Biobuf *b)
420 Merge *m, *mp, **mmp;
426 mmp = emalloc(n*sizeof(*mmp));
427 mp = emalloc(n*sizeof(*mp));
431 for(i=0; i<n; i++,m++) {
435 fprint(2, "sort: reopen %s: %r\n", tf);
439 Binit(&m->b, f, OREAD);
442 if((l = newline(&m->b)) == nil)
457 if(args.uflag && ok && kcmp(ok, l->key) == 0) {
476 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
484 for(i=0; i<n; i++,m++) {
494 kcmp(Key *ka, Key *kb)
499 * set n to length of smaller key
505 return memcmp(ka->key, kb->key, n);
515 sort4(args.linep, args.nline);
518 for(n=args.nline; n>0; n--) {
520 if(args.uflag && ok && kcmp(ok, l->key) == 0)
528 setfield(int n, int c)
535 fprint(2, "sort: unknown option: field.%C\n", c);
537 case 'b': /* skip blanks */
540 case 'd': /* directory order */
543 case 'f': /* fold case */
546 case 'g': /* floating point -n case */
549 case 'i': /* ignore non-ascii */
552 case 'M': /* month */
555 case 'n': /* numbers */
558 case 'r': /* reverse */
561 case 'w': /* ignore white */
568 dofield(char *s, int *n1, int *n2, int off1, int off2)
573 if(c >= '0' && c <= '9') {
575 while(c >= '0' && c <= '9') {
579 n -= off1; /* posix committee: rot in hell */
581 fprint(2, "sort: field offset must be positive\n");
588 if(c >= '0' && c <= '9') {
590 while(c >= '0' && c <= '9') {
596 fprint(2, "sort: character offset must be positive\n");
603 setfield(args.nfield, c);
616 for(i=0; i<=args.nfield; i++) {
622 fprint(2, " +%d", n);
631 if(f->flags & B1flag)
636 fprint(2, " -%d", n);
647 fprint(2, "%sb", prefix);
649 fprint(2, "%sd", prefix);
651 fprint(2, "%sf", prefix);
653 fprint(2, "%sg", prefix);
655 fprint(2, "%si", prefix);
657 fprint(2, "%sM", prefix);
659 fprint(2, "%sn", prefix);
661 fprint(2, "%sr", prefix);
663 fprint(2, "%sw", prefix);
670 fprint(2, " -o %s", args.ofile);
671 if(args.mline != Nline)
672 fprint(2, " -l %ld", args.mline);
684 fprint(2, "sort: too many fields specified\n");
696 doargs(int argc, char *argv[])
704 for(i=1; i<argc; i++) {
709 if(c == 0) /* forced end of arg marker */
711 argv[i] = 0; /* clobber args processed */
712 if(c == '.' || (c >= '0' && c <= '9')) {
715 f = &args.field[args.nfield];
716 dofield(s, &f->end1, &f->end2, 0, 0);
723 case '-': /* end of options */
726 case 'T': /* temp directory */
730 args.tname = argv[i];
737 case 'o': /* output file */
741 args.ofile = argv[i];
748 case 'k': /* posix key (what were they thinking?) */
766 f = &args.field[args.nfield];
767 dofield(p, &f->beg1, &f->beg2, 1, 1);
768 if(f->flags & Bflag) {
773 dofield(q, &f->end1, &f->end2, 1, 0);
779 case 't': /* tab character */
783 chartorune(&args.tabchar, argv[i]);
787 s += chartorune(&args.tabchar, s);
788 if(args.tabchar == '\n') {
789 fprint(2, "aw come on, rob\n");
793 case 'c': /* check order */
796 case 'u': /* unique */
799 case 'v': /* debugging noise */
806 args.mline = atol(argv[i]);
810 args.mline = atol(s);
814 case 'M': /* month */
815 case 'b': /* skip blanks */
816 case 'd': /* directory order */
817 case 'f': /* fold case */
818 case 'g': /* floating numbers */
819 case 'i': /* ignore non-ascii */
820 case 'n': /* numbers */
821 case 'r': /* reverse */
822 case 'w': /* ignore white */
824 fprint(2, "sort: global field set after -k\n");
828 /* option m silently ignored but required by posix */
831 fprint(2, "sort: unknown option: -%C\n", c);
837 argv[i] = 0; /* clobber args processed */
839 if(c == '.' || (c >= '0' && c <= '9')) {
841 f = &args.field[args.nfield];
842 dofield(s, &f->beg1, &f->beg2, 0, 0);
843 if(f->flags & Bflag) {
850 fprint(2, "sort: unknown option: +%C\n", c);
856 for(i=0; i<=args.nfield; i++) {
860 * global options apply to fields that
864 f->flags = args.field[0].flags;
865 if(args.field[0].flags & Bflag)
871 * build buildkey specification
873 switch(f->flags & ~(Bflag|B1flag)) {
875 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
888 case Gflag|Nflag|Rflag:
898 case Dflag|Fflag|Iflag:
899 case Dflag|Fflag|Iflag|Rflag:
900 case Dflag|Fflag|Iflag|Rflag|Wflag:
901 case Dflag|Fflag|Iflag|Wflag:
902 case Dflag|Fflag|Rflag:
903 case Dflag|Fflag|Rflag|Wflag:
904 case Dflag|Fflag|Wflag:
906 case Dflag|Iflag|Rflag:
907 case Dflag|Iflag|Rflag|Wflag:
908 case Dflag|Iflag|Wflag:
910 case Dflag|Rflag|Wflag:
914 case Fflag|Iflag|Rflag:
915 case Fflag|Iflag|Rflag|Wflag:
916 case Fflag|Iflag|Wflag:
918 case Fflag|Rflag|Wflag:
922 case Iflag|Rflag|Wflag:
925 f->dokey = dokey_dfi;
934 if(args.nfile > 1 && args.cflag) {
935 fprint(2, "sort: -c can have at most one input file\n");
942 skip(uchar *l, int n1, int n2, int bflag, int endfield)
947 if(endfield && n1 < 0)
955 for(i=n1; i>0; i--) {
961 if(!(endfield && i == 1))
966 l += ln = chartorune(&r, (char*)l);
967 for(i=n1; i>0; i--) {
971 l += ln = chartorune(&r, (char*)l);
973 if(!(endfield && i == 1))
974 l += ln = chartorune(&r, (char*)l);
979 for(i=n1; i>0; i--) {
980 while(c == ' ' || c == '\t')
982 while(c != ' ' && c != '\t') {
991 while(c == ' ' || c == '\t'){
997 for(i=n2; i>0; i--) {
1005 l += chartorune(&r, (char*)l);
1011 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
1015 int state, nzero, exp, expsign, rflag;
1018 kp = k->key + cl; /* skip place for sign, exponent[2] */
1020 nzero = 0; /* number of trailing zeros */
1021 exp = 0; /* value of the exponent */
1022 expsign = 0; /* sign of the exponent */
1023 dp = 0x4040; /* location of decimal point */
1024 rflag = f->flags&Rflag; /* xor of rflag and - sign */
1032 if(c == ' ' || c == '\t') {
1040 if(c == '+' || c == '-') {
1082 state = NSzerofract;
1087 exp = exp*10 + (c - '0');
1093 if(c >= '1' && c <= '9') {
1120 exp = exp*10 + (c - '0');
1133 state = NSzerofract;
1141 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
1162 kp = k->key + k->klen;
1164 kp[0] = 0x20; /* between + and - */
1168 * result has exponent
1178 * result is fixed point number
1203 kp = k->key + k->klen;
1211 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
1218 rflag = f->flags&Rflag;
1232 lp += chartorune(&r, (char*)lp);
1237 if(c < nelem(f->mapto)) {
1245 for(c=11; c>=0; c--)
1246 if(memcmp(month[c], place, sizeof(place)) == 0)
1264 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
1268 int c, cl, n, rflag;
1272 rflag = f->flags & Rflag;
1282 lp += chartorune(&r, (char*)lp);
1288 * do the various mappings.
1289 * the common case is handled
1290 * completely by the table.
1292 if(c != 0 && c < Runeself) {
1302 * for characters out of range,
1303 * the table does not do Rflag.
1304 * ignore is based on mapto[255]
1306 if(c != 0 && c < nelem(f->mapto)) {
1311 if(f->mapto[nelem(f->mapto)-1] == 0)
1318 n = runetochar((char*)kp, &r);
1340 dokey_r(Key *k, uchar *lp, uchar *lpe, Field*)
1369 dokey_(Key *k, uchar *lp, uchar *lpe, Field*)
1389 int ll, kl, cl, i, n;
1393 kl = 0; /* allocated length */
1394 cl = 0; /* current length */
1397 for(i=1; i<=args.nfield; i++) {
1399 if((lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0)) == nil)
1401 if((lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1)) == nil)
1408 k = erealloc(k, sizeof(Key) +
1409 (kl-1)*sizeof(k->key[0]));
1412 (*f->dokey)(k, lp, lpe, f);
1417 * global comparisons
1419 if(!(args.uflag && cl > 0)) {
1421 if(cl+(ll+4) > kl) {
1423 k = erealloc(k, sizeof(Key) +
1424 (kl-1)*sizeof(k->key[0]));
1427 (*f->dokey)(k, l->line, l->line+ll, f);
1435 if(write(2, l->line, l->llen) != l->llen)
1437 for(i=0; i<k->klen; i++) {
1438 fprint(2, " %.2x", k->key[i]);
1439 if(k->key[i] == 0x00 || k->key[i] == 0xff)
1450 for(i=0; i<nelem(f->mapto); i++) {
1452 if(i == ' ' || i == '\t')
1454 if(i >= 'a' && i <= 'z')
1455 c = i + ('A' - 'a');
1456 if(i >= 'A' && i <= 'Z')
1462 fprint(2, " %.2x", c);
1474 for(i=0; i<nelem(f->mapto); i++) {
1476 if(f->flags & Iflag)
1477 if(c < 040 || c > 0176)
1479 if((f->flags & Wflag) && c >= 0)
1480 if(c == ' ' || c == '\t')
1482 if((f->flags & Dflag) && c >= 0)
1483 if(!(c == ' ' || c == '\t' ||
1484 (c >= 'a' && c <= 'z') ||
1485 (c >= 'A' && c <= 'Z') ||
1486 (c >= '0' && c <= '9'))) {
1487 for(j=0; latinmap[j]; j+=3)
1488 if(c == latinmap[j+0] ||
1491 if(latinmap[j] == 0)
1494 if((f->flags & Fflag) && c >= 0) {
1495 if(c >= 'a' && c <= 'z')
1497 for(j=0; latinmap[j]; j+=3)
1498 if(c == latinmap[j+0] ||
1499 c == latinmap[j+1]) {
1504 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
1512 fprint(2, " %.2x", c);
1521 /* lcase ucase fold */
1570 /************** radix sort ***********/
1577 void rsort4(Key***, ulong, int);
1578 void bsort4(Key***, ulong, int);
1581 sort4(void *a, ulong n)
1584 rsort4((Key***)a, n, 0);
1586 bsort4((Key***)a, n, 0);
1590 rsort4(Key ***a, ulong n, int b)
1592 Key ***ea, ***t, ***u, **t1, **u1, *k;
1594 static long count[257];
1595 long clist[257+257], *cp, *cp1;
1599 * pass 1 over all keys,
1600 * count the number of each key[b].
1601 * find low count and high count.
1606 for(t=a; t<ea; t++) {
1624 * pass 2 over all counts,
1625 * put partition pointers in part[c].
1626 * save compacted indexes and counts
1637 for(c=lowc; c<=higc; c++,cp++) {
1650 * pass 3 over all counts.
1651 * chase lowest pointer in each partition
1652 * around a permutation until it comes
1653 * back and is stored where it started.
1654 * static array, count[], should be
1655 * reduced to zero entries except maybe
1658 for(cp1=clist+1; cp1[0]; cp1+=2) {
1680 * pass 4 over all partitions.
1686 for(cp1=clist+1; n=cp1[0]; cp1+=2) {
1697 * bubble sort to pick up
1701 bsort4(Key ***a, ulong n, int b)
1703 Key ***i, ***j, ***k, ***l, **t;
1724 n2 = ka->key[b] - kb->key[b];
1726 n2 = memcmp(ka->key+b, kb->key+b, n1);
1749 n2 = ka->key[b] - kb->key[b];
1751 n2 = memcmp(ka->key+b, kb->key+b, n1);