7 00/ff for end of file can conflict with 00/ff characters
12 Nline = 100000, /* default max number of lines saved in memory */
13 Nmerge = 10, /* max number of temporary files merged */
14 Nfield = 20, /* max number of argument fields */
16 Bflag = 1<<0, /* flags per field */
28 NSstart = 0, /* states for number to key decoding */
/*
 * give each struct tag a same-named type so the declarations
 * and prototypes in this file can say Line*, Key*, Merge*
 * and Field* without the struct keyword.
 */
40 typedef struct Line Line;
41 typedef struct Key Key;
42 typedef struct Merge Merge;
43 typedef struct Field Field;
48 int llen; /* always >= 1 */
49 uchar line[1]; /* always ends in '\n' */
54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
55 Line* line; /* line at the head of a merged temp file */
56 int fd; /* file descriptor */
57 Biobuf b; /* iobuf for reading a temp file */
76 void (*dokey)(Key*, uchar*, uchar*, Field*);
92 long nline; /* number of lines in this temp file */
93 long lineno; /* overall ordinal for -s option */
95 long mline; /* max lines per file */
/*
 * external tables.
 * latinmap[] is walked three entries at a time until a 0 entry.
 * month[] holds the 12 Rune strings that dokey_m compares against.
 */
98 extern int latinmap[];
99 extern Rune* month[12];
/*
 * forward declarations
 */
101 void buildkey(Line*);
102 void doargs(int, char*[]);
103 void dofield(char*, int*, int*, int, int);
104 void dofile(Biobuf*);
/*
 * key builders.  all have the signature of the dokey function
 * pointer; doargs stores one of them in f->dokey and buildkey
 * calls through that pointer.
 */
105 void dokey_(Key*, uchar*, uchar*, Field*);
106 void dokey_dfi(Key*, uchar*, uchar*, Field*);
107 void dokey_gn(Key*, uchar*, uchar*, Field*);
108 void dokey_m(Key*, uchar*, uchar*, Field*);
109 void dokey_r(Key*, uchar*, uchar*, Field*);
/* key comparison; the result comes from memcmp on the key bytes */
111 int kcmp(Key*, Key*);
112 void makemapd(Field*);
113 void makemapm(Field*);
114 void mergefiles(int, int, Biobuf*);
115 void mergeout(Biobuf*);
117 Line* newline(Biobuf*);
/* note handler; main registers it with notify() */
119 void notifyf(void*, char*);
120 void printargs(void);
121 void printout(Biobuf*);
122 void setfield(int, int);
/* buildkey uses the returned pointer as the start or end of a field within a line */
123 uchar* skip(uchar*, int, int, int, int);
/* called as sort4(args.linep, args.nline); hands off to rsort4 and bsort4 */
124 void sort4(void*, ulong);
127 void lineout(Biobuf*, Line*);
130 main(int argc, char *argv[])
136 notify(notifyf); /**/
141 for(i=1; i<argc; i++) {
145 if(strcmp(s, "-") == 0) {
146 Binit(&bbuf, 0, OREAD);
153 fprint(2, "sort: open %s: %r\n", s);
156 Binit(&bbuf, f, OREAD);
161 if(args.nfile == 0) {
162 Binit(&bbuf, 0, OREAD);
169 fprint(2, "=========\n");
173 f = create(args.ofile, OWRITE, 0666);
175 fprint(2, "sort: create %s: %r\n", args.ofile);
180 Binit(&bbuf, f, OWRITE);
205 n = kcmp(ol->key, l->key);
206 if(n > 0 || (n == 0 && args.uflag)) {
207 fprint(2, "sort: -c file not in sort\n"); /**/
217 if(args.linep == 0) {
218 args.linep = malloc(args.mline * sizeof(args.linep));
226 if(args.nline >= args.mline)
228 args.linep[args.nline] = l;
235 notifyf(void*, char *s)
238 if(strcmp(s, "interrupt") == 0)
240 if(strcmp(s, "hangup") == 0)
242 if(strcmp(s, "kill") == 0)
244 if(strncmp(s, "sys: write on closed pipe", 25) == 0)
246 fprint(2, "sort: note: %s\n", s);
257 p = Brdline(b, '\n');
265 l = realloc(l, sizeof(Line) +
266 (n+31)*sizeof(l->line[0]));
272 fprint(2, "sort: newline added\n");
283 l = malloc(sizeof(Line) +
284 (n-1)*sizeof(l->line[0]));
288 memmove(l->line, p, n);
294 lineout(Biobuf *b, Line *l)
299 m = Bwrite(b, l->line, n);
313 sort4(args.linep, args.nline);
314 tf = tempfile(args.ntemp);
316 f = create(tf, OWRITE, 0666);
318 fprint(2, "sort: create %s: %r\n", tf);
322 Binit(&tb, f, OWRITE);
324 for(n=args.nline; n>0; n--) {
340 for(i=0; i<args.ntemp; i++)
348 fprint(2, "sort: out of memory\n");
355 static char file[100];
362 if(strlen(dir) >= nelem(file)-20) {
363 fprint(2, "temp file directory name is too long: %s\n", dir);
376 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n);
387 for(i=0; i<args.ntemp; i+=n) {
390 tf = tempfile(args.ntemp);
392 f = create(tf, OWRITE, 0666);
394 fprint(2, "sort: create %s: %r\n", tf);
397 Binit(&tb, f, OWRITE);
400 mergefiles(i, n, &tb);
410 mergefiles(int t, int n, Biobuf *b)
412 Merge *m, *mp, **mmp;
418 mmp = malloc(n*sizeof(*mmp));
419 mp = malloc(n*sizeof(*mp));
420 if(mmp == 0 || mp == 0)
425 for(i=0; i<n; i++,m++) {
429 fprint(2, "sort: reopen %s: %r\n", tf);
433 Binit(&m->b, f, OREAD);
452 if(args.uflag && ok && kcmp(ok, l->key) == 0) {
471 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
479 for(i=0; i<n; i++,m++) {
489 kcmp(Key *ka, Key *kb)
494 * set n to length of smaller key
500 return memcmp(ka->key, kb->key, n);
510 sort4(args.linep, args.nline);
513 for(n=args.nline; n>0; n--) {
515 if(args.uflag && ok && kcmp(ok, l->key) == 0)
523 setfield(int n, int c)
530 fprint(2, "sort: unknown option: field.%C\n", c);
532 case 'b': /* skip blanks */
535 case 'd': /* directory order */
538 case 'f': /* fold case */
541 case 'g': /* floating point -n case */
544 case 'i': /* ignore non-ascii */
547 case 'M': /* month */
550 case 'n': /* numbers */
553 case 'r': /* reverse */
556 case 'w': /* ignore white */
563 dofield(char *s, int *n1, int *n2, int off1, int off2)
568 if(c >= '0' && c <= '9') {
570 while(c >= '0' && c <= '9') {
574 n -= off1; /* posix committee: rot in hell */
576 fprint(2, "sort: field offset must be positive\n");
583 if(c >= '0' && c <= '9') {
585 while(c >= '0' && c <= '9') {
591 fprint(2, "sort: character offset must be positive\n");
598 setfield(args.nfield, c);
611 for(i=0; i<=args.nfield; i++) {
617 fprint(2, " +%d", n);
626 if(f->flags & B1flag)
631 fprint(2, " -%d", n);
642 fprint(2, "%sb", prefix);
644 fprint(2, "%sd", prefix);
646 fprint(2, "%sf", prefix);
648 fprint(2, "%sg", prefix);
650 fprint(2, "%si", prefix);
652 fprint(2, "%sM", prefix);
654 fprint(2, "%sn", prefix);
656 fprint(2, "%sr", prefix);
658 fprint(2, "%sw", prefix);
665 fprint(2, " -o %s", args.ofile);
666 if(args.mline != Nline)
667 fprint(2, " -l %ld", args.mline);
679 fprint(2, "sort: too many fields specified\n");
691 doargs(int argc, char *argv[])
699 for(i=1; i<argc; i++) {
704 if(c == 0) /* forced end of arg marker */
706 argv[i] = 0; /* clobber args processed */
707 if(c == '.' || (c >= '0' && c <= '9')) {
710 f = &args.field[args.nfield];
711 dofield(s, &f->end1, &f->end2, 0, 0);
718 case '-': /* end of options */
721 case 'T': /* temp directory */
725 args.tname = argv[i];
732 case 'o': /* output file */
736 args.ofile = argv[i];
743 case 'k': /* posix key (what were they thinking?) */
761 f = &args.field[args.nfield];
762 dofield(p, &f->beg1, &f->beg2, 1, 1);
763 if(f->flags & Bflag) {
768 dofield(q, &f->end1, &f->end2, 1, 0);
774 case 't': /* tab character */
778 chartorune(&args.tabchar, argv[i]);
782 s += chartorune(&args.tabchar, s);
783 if(args.tabchar == '\n') {
784 fprint(2, "aw come on, rob\n");
788 case 'c': /* check order */
791 case 'u': /* unique */
794 case 'v': /* debugging noise */
801 args.mline = atol(argv[i]);
805 args.mline = atol(s);
809 case 'M': /* month */
810 case 'b': /* skip blanks */
811 case 'd': /* directory order */
812 case 'f': /* fold case */
813 case 'g': /* floating numbers */
814 case 'i': /* ignore non-ascii */
815 case 'n': /* numbers */
816 case 'r': /* reverse */
817 case 'w': /* ignore white */
819 fprint(2, "sort: global field set after -k\n");
823 /* option m silently ignored but required by posix */
826 fprint(2, "sort: unknown option: -%C\n", c);
832 argv[i] = 0; /* clobber args processed */
834 if(c == '.' || (c >= '0' && c <= '9')) {
836 f = &args.field[args.nfield];
837 dofield(s, &f->beg1, &f->beg2, 0, 0);
838 if(f->flags & Bflag) {
845 fprint(2, "sort: unknown option: +%C\n", c);
851 for(i=0; i<=args.nfield; i++) {
855 * global options apply to fields that
859 f->flags = args.field[0].flags;
860 if(args.field[0].flags & Bflag)
866 * build buildkey specification
868 switch(f->flags & ~(Bflag|B1flag)) {
870 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
883 case Gflag|Nflag|Rflag:
893 case Dflag|Fflag|Iflag:
894 case Dflag|Fflag|Iflag|Rflag:
895 case Dflag|Fflag|Iflag|Rflag|Wflag:
896 case Dflag|Fflag|Iflag|Wflag:
897 case Dflag|Fflag|Rflag:
898 case Dflag|Fflag|Rflag|Wflag:
899 case Dflag|Fflag|Wflag:
901 case Dflag|Iflag|Rflag:
902 case Dflag|Iflag|Rflag|Wflag:
903 case Dflag|Iflag|Wflag:
905 case Dflag|Rflag|Wflag:
909 case Fflag|Iflag|Rflag:
910 case Fflag|Iflag|Rflag|Wflag:
911 case Fflag|Iflag|Wflag:
913 case Fflag|Rflag|Wflag:
917 case Iflag|Rflag|Wflag:
920 f->dokey = dokey_dfi;
929 if(args.nfile > 1 && args.cflag) {
930 fprint(2, "sort: -c can have at most one input file\n");
937 skip(uchar *l, int n1, int n2, int bflag, int endfield)
942 if(endfield && n1 < 0)
949 for(i=n1; i>0; i--) {
955 if(!(endfield && i == 1))
960 l += chartorune(&r, (char*)l);
961 for(i=n1; i>0; i--) {
965 l += chartorune(&r, (char*)l);
967 if(!(endfield && i == 1))
968 l += chartorune(&r, (char*)l);
973 for(i=n1; i>0; i--) {
974 while(c == ' ' || c == '\t')
976 while(c != ' ' && c != '\t') {
985 while(c == ' ' || c == '\t')
989 for(i=n2; i>0; i--) {
997 l += chartorune(&r, (char*)l);
1003 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
1007 int state, nzero, exp, expsign, rflag;
1010 kp = k->key + cl; /* skip place for sign, exponent[2] */
1012 nzero = 0; /* number of trailing zeros */
1013 exp = 0; /* value of the exponent */
1014 expsign = 0; /* sign of the exponent */
1015 dp = 0x4040; /* location of decimal point */
1016 rflag = f->flags&Rflag; /* xor of rflag and - sign */
1024 if(c == ' ' || c == '\t') {
1032 if(c == '+' || c == '-') {
1074 state = NSzerofract;
1079 exp = exp*10 + (c - '0');
1085 if(c >= '1' && c <= '9') {
1112 exp = exp*10 + (c - '0');
1125 state = NSzerofract;
1133 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
1154 kp = k->key + k->klen;
1156 kp[0] = 0x20; /* between + and - */
1160 * result has exponent
1170 * result is fixed point number
1195 kp = k->key + k->klen;
1203 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
1210 rflag = f->flags&Rflag;
1224 lp += chartorune(&r, (char*)lp);
1229 if(c < nelem(f->mapto)) {
1237 for(c=11; c>=0; c--)
1238 if(memcmp(month[c], place, sizeof(place)) == 0)
1256 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
1260 int c, cl, n, rflag;
1264 rflag = f->flags & Rflag;
1274 lp += chartorune(&r, (char*)lp);
1280 * do the various mappings.
1281 * the common case is handled
1282 * completely by the table.
1284 if(c != 0 && c < Runeself) {
1294 * for characters out of range,
1295 * the table does not do Rflag.
1296 * ignore is based on mapto[255]
1298 if(c != 0 && c < nelem(f->mapto)) {
1303 if(f->mapto[nelem(f->mapto)-1] == 0)
1310 n = runetochar((char*)kp, &r);
1332 dokey_r(Key *k, uchar *lp, uchar *lpe, Field*)
1361 dokey_(Key *k, uchar *lp, uchar *lpe, Field*)
1381 int ll, kl, cl, i, n;
1385 kl = 0; /* allocated length */
1386 cl = 0; /* current length */
1389 for(i=1; i<=args.nfield; i++) {
1391 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0);
1394 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1);
1402 k = realloc(k, sizeof(Key) +
1403 (kl-1)*sizeof(k->key[0]));
1408 (*f->dokey)(k, lp, lpe, f);
1413 * global comparisons
1415 if(!(args.uflag && cl > 0)) {
1417 if(cl+(ll+4) > kl) {
1419 k = realloc(k, sizeof(Key) +
1420 (kl-1)*sizeof(k->key[0]));
1425 (*f->dokey)(k, l->line, l->line+ll, f);
1433 write(2, l->line, l->llen);
1434 for(i=0; i<k->klen; i++) {
1435 fprint(2, " %.2x", k->key[i]);
1436 if(k->key[i] == 0x00 || k->key[i] == 0xff)
1447 for(i=0; i<nelem(f->mapto); i++) {
1449 if(i == ' ' || i == '\t')
1451 if(i >= 'a' && i <= 'z')
1452 c = i + ('A' - 'a');
1453 if(i >= 'A' && i <= 'Z')
1459 fprint(2, " %.2x", c);
1471 for(i=0; i<nelem(f->mapto); i++) {
1473 if(f->flags & Iflag)
1474 if(c < 040 || c > 0176)
1476 if((f->flags & Wflag) && c >= 0)
1477 if(c == ' ' || c == '\t')
1479 if((f->flags & Dflag) && c >= 0)
1480 if(!(c == ' ' || c == '\t' ||
1481 (c >= 'a' && c <= 'z') ||
1482 (c >= 'A' && c <= 'Z') ||
1483 (c >= '0' && c <= '9'))) {
1484 for(j=0; latinmap[j]; j+=3)
1485 if(c == latinmap[j+0] ||
1488 if(latinmap[j] == 0)
1491 if((f->flags & Fflag) && c >= 0) {
1492 if(c >= 'a' && c <= 'z')
1494 for(j=0; latinmap[j]; j+=3)
1495 if(c == latinmap[j+0] ||
1496 c == latinmap[j+1]) {
1501 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
1509 fprint(2, " %.2x", c);
1518 /* lcase ucase fold */
1567 /************** radix sort ***********/
/*
 * helpers behind sort4, which calls both with its array cast
 * to Key*** and a final argument of 0.  bsort4 uses that final
 * argument as a byte offset into each key (key[b], key+b).
 */
1574 void rsort4(Key***, ulong, int);
1575 void bsort4(Key***, ulong, int);
1578 sort4(void *a, ulong n)
1581 rsort4((Key***)a, n, 0);
1583 bsort4((Key***)a, n, 0);
1587 rsort4(Key ***a, ulong n, int b)
1589 Key ***ea, ***t, ***u, **t1, **u1, *k;
1591 static long count[257];
1592 long clist[257+257], *cp, *cp1;
1596 * pass 1 over all keys,
1597 * count the number of each key[b].
1598 * find low count and high count.
1603 for(t=a; t<ea; t++) {
1621 * pass 2 over all counts,
1622 * put partition pointers in part[c].
1623 * save compacted indexes and counts
1634 for(c=lowc; c<=higc; c++,cp++) {
1647 * pass 3 over all counts.
1648 * chase lowest pointer in each partition
1649 * around a permutation until it comes
1650 * back and is stored where it started.
1651 * static array, count[], should be
1652 * reduced to zero entries except maybe
1655 for(cp1=clist+1; cp1[0]; cp1+=2) {
1677 * pass 4 over all partitions.
1683 for(cp1=clist+1; n=cp1[0]; cp1+=2) {
1694 * bubble sort to pick up
1698 bsort4(Key ***a, ulong n, int b)
1700 Key ***i, ***j, ***k, ***l, **t;
1721 n2 = ka->key[b] - kb->key[b];
1723 n2 = memcmp(ka->key+b, kb->key+b, n1);
1746 n2 = ka->key[b] - kb->key[b];
1748 n2 = memcmp(ka->key+b, kb->key+b, n1);