7 00/ff for end of file can conflict with 00/ff characters
12 Nline = 100000, /* default max number of lines saved in memory */
13 Nmerge = 10, /* max number of temporary files merged */
14 Nfield = 20, /* max number of argument fields */
16 Bflag = 1<<0, /* flags per field */
28 NSstart = 0, /* states for number to key decoding */
/*
 * Forward typedefs: each struct tag becomes usable as a plain
 * type name (Line, Key, Merge, Field) in the declarations below.
 */
40 typedef struct Line Line;
41 typedef struct Key Key;
42 typedef struct Merge Merge;
43 typedef struct Field Field;
48 int llen; /* always >= 1 */
49 uchar line[1]; /* always ends in '\n' */
54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
55 Line* line; /* line at the head of a merged temp file */
56 int fd; /* file descriptor */
57 Biobuf b; /* iobuf for reading a temp file */
76 void (*dokey)(Key*, uchar*, uchar*, Field*);
92 long nline; /* number of lines in this temp file */
93 long lineno; /* overall ordinal for -s option */
95 long mline; /* max lines per file */
/*
 * External tables and forward declarations for the routines
 * defined later in this file.
 */
98 extern int latinmap[]; /* walked three entries at a time; a 0 entry ends the walk */
99 extern Rune* month[12]; /* twelve entries; dokey_m compares against each with memcmp */
101 void buildkey(Line*);
102 void doargs(int, char*[]); /* walks argv from index 1; processed entries are set to 0 */
103 void dofield(char*, int*, int*, int, int); /* reads decimal digits; complains if an offset is not positive */
104 void dofile(Biobuf*);
/* the dokey_* routines share the signature of the dokey function pointer */
105 void dokey_(Key*, uchar*, uchar*, Field*);
106 void dokey_dfi(Key*, uchar*, uchar*, Field*);
107 void dokey_gn(Key*, uchar*, uchar*, Field*);
108 void dokey_m(Key*, uchar*, uchar*, Field*);
109 void dokey_r(Key*, uchar*, uchar*, Field*);
111 void* emalloc(ulong); /* malloc wrapper; prints "malloc: %r" on failure */
112 void* erealloc(void*, ulong); /* realloc wrapper; prints "realloc: %r" on failure for non-zero sizes */
113 int kcmp(Key*, Key*); /* memcmp of the key bytes over the length of the smaller key */
114 void makemapd(Field*);
115 void makemapm(Field*);
116 void mergefiles(int, int, Biobuf*);
117 void mergeout(Biobuf*);
119 Line* newline(Biobuf*); /* reads with Brdline on '\n'; callers treat nil as "no more lines" */
120 void notifyf(void*, char*); /* note handler installed with notify() in main */
121 void printargs(void);
122 void printout(Biobuf*);
123 void setfield(int, int);
124 uchar* skip(uchar*, int, int, int, int);
125 void sort4(void*, ulong); /* entry point of the radix sort; dispatches to rsort4 or bsort4 */
128 void lineout(Biobuf*, Line*); /* writes l->line to the Biobuf with Bwrite */
131 main(int argc, char *argv[])
137 notify(notifyf); /**/
142 for(i=1; i<argc; i++) {
143 if((s = argv[i]) == nil)
145 if(strcmp(s, "-") == 0) {
146 Binit(&bbuf, 0, OREAD);
153 fprint(2, "sort: open %s: %r\n", s);
156 Binit(&bbuf, f, OREAD);
161 if(args.nfile == 0) {
162 Binit(&bbuf, 0, OREAD);
169 fprint(2, "=========\n");
173 f = create(args.ofile, OWRITE, 0666);
175 fprint(2, "sort: create %s: %r\n", args.ofile);
180 Binit(&bbuf, f, OWRITE);
198 if((ol = newline(b)) == nil)
201 if((l = newline(b)) == nil)
203 n = kcmp(ol->key, l->key);
204 if(n > 0 || (n == 0 && args.uflag)) {
205 fprint(2, "sort: -c file not in sort\n"); /**/
217 if(args.linep == nil)
218 args.linep = emalloc(args.mline * sizeof(args.linep));
220 if((l = newline(b)) == nil)
222 if(args.nline >= args.mline)
224 args.linep[args.nline] = l;
231 notifyf(void*, char *s)
234 if(strcmp(s, "interrupt") == 0)
236 if(strcmp(s, "hangup") == 0)
238 if(strcmp(s, "kill") == 0)
240 if(strncmp(s, "sys: write on closed pipe", 25) == 0)
242 fprint(2, "sort: note: %s\n", s);
253 p = Brdline(b, '\n');
261 l = erealloc(l, sizeof(Line) +
262 (n+31)*sizeof(l->line[0]));
265 fprint(2, "sort: newline added\n");
269 sysfatal("bug: l == nil");
278 l = emalloc(sizeof(Line) + (n-1)*sizeof(l->line[0]));
280 memmove(l->line, p, n);
286 lineout(Biobuf *b, Line *l)
291 m = Bwrite(b, l->line, n);
305 sort4(args.linep, args.nline);
306 tf = tempfile(args.ntemp);
308 f = create(tf, OWRITE, 0666);
310 fprint(2, "sort: create %s: %r\n", tf);
314 Binit(&tb, f, OWRITE);
316 for(n=args.nline; n>0; n--) {
332 for(i=0; i<args.ntemp; i++)
338 erealloc(void *v, ulong n)
340 if((v = realloc(v, n)) == nil && n != 0){
341 fprint(2, "realloc: %r\n");
353 if((v = malloc(n)) == nil){
354 fprint(2, "malloc: %r\n");
365 static char file[100];
372 if(strlen(dir) >= nelem(file)-20) {
373 fprint(2, "temp file directory name is too long: %s\n", dir);
386 snprint(file, sizeof(file), "%s/sort.%.4d.%.4d", dir, pid%10000, n);
397 for(i=0; i<args.ntemp; i+=n) {
400 tf = tempfile(args.ntemp);
402 f = create(tf, OWRITE, 0666);
404 fprint(2, "sort: create %s: %r\n", tf);
407 Binit(&tb, f, OWRITE);
410 mergefiles(i, n, &tb);
420 mergefiles(int t, int n, Biobuf *b)
422 Merge *m, *mp, **mmp;
428 mmp = emalloc(n*sizeof(*mmp));
429 mp = emalloc(n*sizeof(*mp));
433 for(i=0; i<n; i++,m++) {
437 fprint(2, "sort: reopen %s: %r\n", tf);
441 Binit(&m->b, f, OREAD);
444 if((l = newline(&m->b)) == nil)
459 if(args.uflag && ok && kcmp(ok, l->key) == 0) {
478 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
486 for(i=0; i<n; i++,m++) {
496 kcmp(Key *ka, Key *kb)
501 * set n to length of smaller key
507 return memcmp(ka->key, kb->key, n);
517 sort4(args.linep, args.nline);
520 for(n=args.nline; n>0; n--) {
522 if(args.uflag && ok && kcmp(ok, l->key) == 0)
530 setfield(int n, int c)
537 fprint(2, "sort: unknown option: field.%C\n", c);
539 case 'b': /* skip blanks */
542 case 'd': /* directory order */
545 case 'f': /* fold case */
548 case 'g': /* floating point -n case */
551 case 'i': /* ignore non-ascii */
554 case 'M': /* month */
557 case 'n': /* numbers */
560 case 'r': /* reverse */
563 case 'w': /* ignore white */
570 dofield(char *s, int *n1, int *n2, int off1, int off2)
575 if(c >= '0' && c <= '9') {
577 while(c >= '0' && c <= '9') {
581 n -= off1; /* posix committee: rot in hell */
583 fprint(2, "sort: field offset must be positive\n");
590 if(c >= '0' && c <= '9') {
592 while(c >= '0' && c <= '9') {
598 fprint(2, "sort: character offset must be positive\n");
605 setfield(args.nfield, c);
618 for(i=0; i<=args.nfield; i++) {
624 fprint(2, " +%d", n);
633 if(f->flags & B1flag)
638 fprint(2, " -%d", n);
649 fprint(2, "%sb", prefix);
651 fprint(2, "%sd", prefix);
653 fprint(2, "%sf", prefix);
655 fprint(2, "%sg", prefix);
657 fprint(2, "%si", prefix);
659 fprint(2, "%sM", prefix);
661 fprint(2, "%sn", prefix);
663 fprint(2, "%sr", prefix);
665 fprint(2, "%sw", prefix);
672 fprint(2, " -o %s", args.ofile);
673 if(args.mline != Nline)
674 fprint(2, " -l %ld", args.mline);
686 fprint(2, "sort: too many fields specified\n");
698 doargs(int argc, char *argv[])
706 for(i=1; i<argc; i++) {
711 if(c == 0) /* forced end of arg marker */
713 argv[i] = 0; /* clobber args processed */
714 if(c == '.' || (c >= '0' && c <= '9')) {
717 f = &args.field[args.nfield];
718 dofield(s, &f->end1, &f->end2, 0, 0);
725 case '-': /* end of options */
728 case 'T': /* temp directory */
732 args.tname = argv[i];
739 case 'o': /* output file */
743 args.ofile = argv[i];
750 case 'k': /* posix key (what were they thinking?) */
768 f = &args.field[args.nfield];
769 dofield(p, &f->beg1, &f->beg2, 1, 1);
770 if(f->flags & Bflag) {
775 dofield(q, &f->end1, &f->end2, 1, 0);
781 case 't': /* tab character */
785 chartorune(&args.tabchar, argv[i]);
789 s += chartorune(&args.tabchar, s);
790 if(args.tabchar == '\n') {
791 fprint(2, "aw come on, rob\n");
795 case 'c': /* check order */
798 case 'u': /* unique */
801 case 'v': /* debugging noise */
808 args.mline = atol(argv[i]);
812 args.mline = atol(s);
816 case 'M': /* month */
817 case 'b': /* skip blanks */
818 case 'd': /* directory order */
819 case 'f': /* fold case */
820 case 'g': /* floating numbers */
821 case 'i': /* ignore non-ascii */
822 case 'n': /* numbers */
823 case 'r': /* reverse */
824 case 'w': /* ignore white */
826 fprint(2, "sort: global field set after -k\n");
830 /* option m silently ignored but required by posix */
833 fprint(2, "sort: unknown option: -%C\n", c);
839 argv[i] = 0; /* clobber args processed */
841 if(c == '.' || (c >= '0' && c <= '9')) {
843 f = &args.field[args.nfield];
844 dofield(s, &f->beg1, &f->beg2, 0, 0);
845 if(f->flags & Bflag) {
852 fprint(2, "sort: unknown option: +%C\n", c);
858 for(i=0; i<=args.nfield; i++) {
862 * global options apply to fields that
866 f->flags = args.field[0].flags;
867 if(args.field[0].flags & Bflag)
873 * build buildkey specification
875 switch(f->flags & ~(Bflag|B1flag)) {
877 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
890 case Gflag|Nflag|Rflag:
900 case Dflag|Fflag|Iflag:
901 case Dflag|Fflag|Iflag|Rflag:
902 case Dflag|Fflag|Iflag|Rflag|Wflag:
903 case Dflag|Fflag|Iflag|Wflag:
904 case Dflag|Fflag|Rflag:
905 case Dflag|Fflag|Rflag|Wflag:
906 case Dflag|Fflag|Wflag:
908 case Dflag|Iflag|Rflag:
909 case Dflag|Iflag|Rflag|Wflag:
910 case Dflag|Iflag|Wflag:
912 case Dflag|Rflag|Wflag:
916 case Fflag|Iflag|Rflag:
917 case Fflag|Iflag|Rflag|Wflag:
918 case Fflag|Iflag|Wflag:
920 case Fflag|Rflag|Wflag:
924 case Iflag|Rflag|Wflag:
927 f->dokey = dokey_dfi;
936 if(args.nfile > 1 && args.cflag) {
937 fprint(2, "sort: -c can have at most one input file\n");
944 skip(uchar *l, int n1, int n2, int bflag, int endfield)
949 if(endfield && n1 < 0)
957 for(i=n1; i>0; i--) {
963 if(!(endfield && i == 1))
968 l += ln = chartorune(&r, (char*)l);
969 for(i=n1; i>0; i--) {
973 l += ln = chartorune(&r, (char*)l);
975 if(!(endfield && i == 1))
976 l += ln = chartorune(&r, (char*)l);
981 for(i=n1; i>0; i--) {
982 while(c == ' ' || c == '\t')
984 while(c != ' ' && c != '\t') {
993 while(c == ' ' || c == '\t'){
999 for(i=n2; i>0; i--) {
1007 l += chartorune(&r, (char*)l);
1013 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
1017 int state, nzero, exp, expsign, rflag;
1020 kp = k->key + cl; /* skip place for sign, exponent[2] */
1022 nzero = 0; /* number of trailing zeros */
1023 exp = 0; /* value of the exponent */
1024 expsign = 0; /* sign of the exponent */
1025 dp = 0x4040; /* location of decimal point */
1026 rflag = f->flags&Rflag; /* xor of rflag and - sign */
1034 if(c == ' ' || c == '\t') {
1042 if(c == '+' || c == '-') {
1084 state = NSzerofract;
1089 exp = exp*10 + (c - '0');
1095 if(c >= '1' && c <= '9') {
1122 exp = exp*10 + (c - '0');
1135 state = NSzerofract;
1143 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
1164 kp = k->key + k->klen;
1166 kp[0] = 0x20; /* between + and - */
1170 * result has exponent
1180 * result is fixed point number
1205 kp = k->key + k->klen;
1213 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
1220 rflag = f->flags&Rflag;
1234 lp += chartorune(&r, (char*)lp);
1239 if(c < nelem(f->mapto)) {
1247 for(c=11; c>=0; c--)
1248 if(memcmp(month[c], place, sizeof(place)) == 0)
1266 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
1270 int c, cl, n, rflag;
1274 rflag = f->flags & Rflag;
1284 lp += chartorune(&r, (char*)lp);
1290 * do the various mappings.
1291 * the common case is handled
1292 * completely by the table.
1294 if(c != 0 && c < Runeself) {
1304 * for characters out of range,
1305 * the table does not do Rflag.
1306 * ignore is based on mapto[255]
1308 if(c != 0 && c < nelem(f->mapto)) {
1313 if(f->mapto[nelem(f->mapto)-1] == 0)
1320 n = runetochar((char*)kp, &r);
1342 dokey_r(Key *k, uchar *lp, uchar *lpe, Field*)
1371 dokey_(Key *k, uchar *lp, uchar *lpe, Field*)
1391 int ll, kl, cl, i, n;
1395 kl = 0; /* allocated length */
1396 cl = 0; /* current length */
1399 for(i=1; i<=args.nfield; i++) {
1401 if((lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0)) == nil)
1403 if((lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1)) == nil)
1410 k = erealloc(k, sizeof(Key) +
1411 (kl-1)*sizeof(k->key[0]));
1414 (*f->dokey)(k, lp, lpe, f);
1419 * global comparisons
1421 if(!(args.uflag && cl > 0)) {
1423 if(cl+(ll+4) > kl) {
1425 k = erealloc(k, sizeof(Key) +
1426 (kl-1)*sizeof(k->key[0]));
1429 (*f->dokey)(k, l->line, l->line+ll, f);
1437 if(write(2, l->line, l->llen) != l->llen)
1439 for(i=0; i<k->klen; i++) {
1440 fprint(2, " %.2x", k->key[i]);
1441 if(k->key[i] == 0x00 || k->key[i] == 0xff)
1452 for(i=0; i<nelem(f->mapto); i++) {
1454 if(i == ' ' || i == '\t')
1456 if(i >= 'a' && i <= 'z')
1457 c = i + ('A' - 'a');
1458 if(i >= 'A' && i <= 'Z')
1464 fprint(2, " %.2x", c);
1476 for(i=0; i<nelem(f->mapto); i++) {
1478 if(f->flags & Iflag)
1479 if(c < 040 || c > 0176)
1481 if((f->flags & Wflag) && c >= 0)
1482 if(c == ' ' || c == '\t')
1484 if((f->flags & Dflag) && c >= 0)
1485 if(!(c == ' ' || c == '\t' ||
1486 (c >= 'a' && c <= 'z') ||
1487 (c >= 'A' && c <= 'Z') ||
1488 (c >= '0' && c <= '9'))) {
1489 for(j=0; latinmap[j]; j+=3)
1490 if(c == latinmap[j+0] ||
1493 if(latinmap[j] == 0)
1496 if((f->flags & Fflag) && c >= 0) {
1497 if(c >= 'a' && c <= 'z')
1499 for(j=0; latinmap[j]; j+=3)
1500 if(c == latinmap[j+0] ||
1501 c == latinmap[j+1]) {
1506 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
1514 fprint(2, " %.2x", c);
1523 /* lcase ucase fold */
1572 /************** radix sort ***********/
/*
 * Sorting helpers used by sort4, which calls them with the
 * final argument 0; that argument is used as an index into
 * the key bytes (key[b]).
 */
1579 void rsort4(Key***, ulong, int);
1580 void bsort4(Key***, ulong, int);
1583 sort4(void *a, ulong n)
1586 rsort4((Key***)a, n, 0);
1588 bsort4((Key***)a, n, 0);
1592 rsort4(Key ***a, ulong n, int b)
1594 Key ***ea, ***t, ***u, **t1, **u1, *k;
1596 static long count[257];
1597 long clist[257+257], *cp, *cp1;
1601 * pass 1 over all keys,
1602 * count the number of each key[b].
1603 * find low count and high count.
1608 for(t=a; t<ea; t++) {
1626 * pass 2 over all counts,
1627 * put partition pointers in part[c].
1628 * save compacted indexes and counts
1639 for(c=lowc; c<=higc; c++,cp++) {
1652 * pass 3 over all counts.
1653 * chase lowest pointer in each partition
1654 * around a permutation until it comes
1655 * back and is stored where it started.
1656 * static array, count[], should be
1657 * reduced to zero entries except maybe
1660 for(cp1=clist+1; cp1[0]; cp1+=2) {
1682 * pass 4 over all partitions.
1688 for(cp1=clist+1; n=cp1[0]; cp1+=2) {
1699 * bubble sort to pick up
1703 bsort4(Key ***a, ulong n, int b)
1705 Key ***i, ***j, ***k, ***l, **t;
1726 n2 = ka->key[b] - kb->key[b];
1728 n2 = memcmp(ka->key+b, kb->key+b, n1);
1751 n2 = ka->key[b] - kb->key[b];
1753 n2 = memcmp(ka->key+b, kb->key+b, n1);