]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/port/devbridge.c
kernel: avoid selecting the boot process in killbig()
[plan9front.git] / sys / src / 9 / port / devbridge.c
1 /*
2  * IP Ethernet bridge
3  */
4 #include "u.h"
5 #include "../port/lib.h"
6 #include "mem.h"
7 #include "dat.h"
8 #include "fns.h"
9 #include "../ip/ip.h"
10 #include "../ip/ipv6.h"
11 #include "../port/netif.h"
12 #include "../port/error.h"
13
/* short names for the structures defined in this file */
typedef struct Bridge Bridge;
typedef struct Port Port;
typedef struct Centry Centry;
typedef struct Tcphdr Tcphdr;
18
19 enum
20 {
21         Qtopdir=        1,              /* top level directory */
22
23         Qbridgedir,                     /* bridge* directory */
24         Qbctl,
25         Qstats,
26         Qcache,
27         Qlog,
28
29         Qportdir,                       /* directory for a protocol */
30         Qpctl,
31         Qlocal,
32         Qstatus,
33
34         MaxQ,
35
36         Maxbridge=      4,
37         Maxport=        128,            // power of 2
38         CacheHash=      257,            // prime
39         CacheLook=      5,              // how many cache entries to examine
40         CacheSize=      (CacheHash+CacheLook-1),
41         CacheTimeout=   5*60,           // timeout for cache entry in seconds
42         MaxMTU= IP_MAX, // allow for jumbo frames and large UDP
43
44         TcpMssMax = 1300,               // max desirable Tcp MSS value
45         TunnelMtu = 1400,
46 };
47
48 static Dirtab bridgedirtab[]={
49         "ctl",          {Qbctl},        0,      0666,
50         "stats",        {Qstats},       0,      0444,
51         "cache",        {Qcache},       0,      0444,
52         "log",          {Qlog},         0,      0666,
53 };
54
55 static Dirtab portdirtab[]={
56         "ctl",          {Qpctl},        0,      0666,
57         "local",        {Qlocal},       0,      0444,
58         "status",       {Qstatus},      0,      0444,
59 };
60
/* log flag bits; the names for them are in logflags[] */
enum
{
	Logcache = (1<<0),
	Logmcast = (1<<1),
};
65
/* types of interfaces a port can be bound to */
enum
{
	Tether,
	Ttun,
};
72
73 static Logflag logflags[] =
74 {
75         { "cache",      Logcache, },
76         { "multicast",  Logmcast, },
77         { nil,          0, },
78 };
79
80 static Dirtab   *dirtab[MaxQ];
81
/*
 * qid path layout: the low byte is the file type (Q*),
 * the bits above it hold the port number.
 */
#define TYPE(x)		(((ulong)(x).path) & 0xff)
#define PORT(x)		((((ulong)(x).path) >> 8)&(Maxport-1))
#define QID(x, y)	(((x)<<8) | (y))
85
86 struct Centry
87 {
88         uchar   d[Eaddrlen];
89         int     port;
90         long    expire;         // entry expires this many seconds after bootime
91         long    src;
92         long    dst;
93 };
94
95 struct Bridge
96 {
97         QLock;
98         int     nport;
99         Port    *port[Maxport];
100         Centry  cache[CacheSize];
101         ulong   hit;
102         ulong   miss;
103         ulong   copy;
104         long    delay0;         // constant microsecond delay per packet
105         long    delayn;         // microsecond delay per byte
106         int     tcpmss;         // modify tcpmss value
107
108         Log;
109 };
110
111 struct Port
112 {
113         Ref;
114         int     id;
115         Bridge  *bridge;
116         int     closed;
117
118         Chan    *data[2];       // channel to data
119
120         Proc    *readp;         // read proc
121         
122         // the following uniquely identifies the port
123         int     type;
124         char    name[KNAMELEN];
125         
126         // owner hash - avoids bind/unbind races
127         ulong   ownhash;
128
129         // various stats
130         int     in;             // number of packets read
131         int     inmulti;        // multicast or broadcast
132         int     inunknown;      // unknown address
133         int     out;            // number of packets read
134         int     outmulti;       // multicast or broadcast
135         int     outunknown;     // unknown address
136         int     outfrag;        // fragmented the packet
137         int     nentry;         // number of cache entries for this port
138 };
139
/* TCP constants used by tcpmsshack */
enum
{
	EOLOPT = 0,		/* option kind: end of option list */
	NOOPOPT = 1,		/* option kind: no-op */
	MSSOPT = 2,		/* option kind: maximum segment size */
	MSS_LENGTH = 4,		/* length in bytes of the MSS option */
	SYN = 0x02,		/* Pkt. is synchronise */
	TCPHDR = 20,		/* size of the TCP header without options */
};
148
149 struct Tcphdr
150 {
151         uchar   sport[2];
152         uchar   dport[2];
153         uchar   seq[4];
154         uchar   ack[4];
155         uchar   flag[2];
156         uchar   win[2];
157         uchar   cksum[2];
158         uchar   urg[2];
159 };
160
161 static Bridge bridgetab[Maxbridge];
162
163 static int m2p[] = {
164         [OREAD]         4,
165         [OWRITE]        2,
166         [ORDWR]         6
167 };
168
169 static int      bridgegen(Chan *c, char*, Dirtab*, int, int s, Dir *dp);
170 static void     portbind(Bridge *b, int argc, char *argv[]);
171 static void     portunbind(Bridge *b, int argc, char *argv[]);
172 static void     etherread(void *a);
173 static char     *cachedump(Bridge *b);
174 static void     portfree(Port *port);
175 static void     cacheflushport(Bridge *b, int port);
176 static void     etherwrite(Port *port, Block *bp);
177
178 static void
179 bridgeinit(void)
180 {
181         int i;
182         Dirtab *dt;
183
184         // setup dirtab with non directory entries
185         for(i=0; i<nelem(bridgedirtab); i++) {
186                 dt = bridgedirtab + i;
187                 dirtab[TYPE(dt->qid)] = dt;
188         }
189         for(i=0; i<nelem(portdirtab); i++) {
190                 dt = portdirtab + i;
191                 dirtab[TYPE(dt->qid)] = dt;
192         }
193 }
194
195 static Chan*
196 bridgeattach(char *spec)
197 {
198         Chan *c;
199         ulong dev;
200
201         dev = strtoul(spec, nil, 10);
202         if(dev >= Maxbridge)
203                 error(Enodev);
204
205         c = devattach('B', spec);
206         mkqid(&c->qid, QID(0, Qtopdir), 0, QTDIR);
207         c->dev = dev;
208         return c;
209 }
210
211 static Walkqid*
212 bridgewalk(Chan *c, Chan *nc, char **name, int nname)
213 {
214         return devwalk(c, nc, name, nname, (Dirtab*)0, 0, bridgegen);
215 }
216
217 static int
218 bridgestat(Chan* c, uchar* db, int n)
219 {
220         return devstat(c, db, n, (Dirtab *)0, 0L, bridgegen);
221 }
222
223 static Chan*
224 bridgeopen(Chan* c, int omode)
225 {
226         int perm;
227         Bridge *b;
228
229         omode &= 3;
230         perm = m2p[omode];
231         USED(perm);
232
233         b = bridgetab + c->dev;
234         USED(b);
235
236         switch(TYPE(c->qid)) {
237         default:
238                 break;
239         case Qlog:
240                 logopen(b);
241                 break;
242         case Qcache:
243                 c->aux = cachedump(b);
244                 break;
245         }
246         c->mode = openmode(omode);
247         c->flag |= COPEN;
248         c->offset = 0;
249         return c;
250 }
251
252 static void
253 bridgeclose(Chan* c)
254 {
255         Bridge *b  = bridgetab + c->dev;
256
257         switch(TYPE(c->qid)) {
258         case Qcache:
259                 if(c->flag & COPEN)
260                         free(c->aux);
261                 break;
262         case Qlog:
263                 if(c->flag & COPEN)
264                         logclose(b);
265                 break;
266         }
267 }
268
269 static long
270 bridgeread(Chan *c, void *a, long n, vlong off)
271 {
272         char buf[256];
273         Bridge *b = bridgetab + c->dev;
274         Port *port;
275         int i, ingood, outgood;
276
277         USED(off);
278         switch(TYPE(c->qid)) {
279         default:
280                 error(Egreg);
281         case Qtopdir:
282         case Qbridgedir:
283         case Qportdir:
284                 return devdirread(c, a, n, 0, 0, bridgegen);
285         case Qlog:
286                 return logread(b, a, off, n);
287         case Qlocal:
288                 return 0;       /* TO DO */
289         case Qstatus:
290                 qlock(b);
291                 if(waserror()){
292                         qunlock(b);
293                         nexterror();
294                 }
295                 port = b->port[PORT(c->qid)];
296                 if(port == 0)
297                         strcpy(buf, "unbound\n");
298                 else {
299                         i = 0;
300                         switch(port->type) {
301                         default:
302                                 panic("bridgeread: unknown port type: %d",
303                                         port->type);
304                         case Tether:
305                                 i += snprint(buf+i, sizeof(buf)-i, "ether %s: ", port->name);
306                                 break;
307                         case Ttun:
308                                 i += snprint(buf+i, sizeof(buf)-i, "tunnel %s: ", port->name);
309                                 break;
310                         }
311                         ingood = port->in - port->inmulti - port->inunknown;
312                         outgood = port->out - port->outmulti - port->outunknown;
313                         snprint(buf+i, sizeof(buf)-i,
314                                 "in=%d(%d:%d:%d) out=%d(%d:%d:%d:%d)\n",
315                                 port->in, ingood, port->inmulti, port->inunknown,
316                                 port->out, outgood, port->outmulti,
317                                 port->outunknown, port->outfrag);
318                 }
319                 poperror();
320                 qunlock(b);
321                 return readstr(off, a, n, buf);
322         case Qbctl:
323                 snprint(buf, sizeof(buf), "%s tcpmss\ndelay %ld %ld\n",
324                         b->tcpmss ? "set" : "clear", b->delay0, b->delayn);
325                 n = readstr(off, a, n, buf);
326                 return n;
327         case Qcache:
328                 n = readstr(off, a, n, c->aux);
329                 return n;
330         case Qstats:
331                 snprint(buf, sizeof(buf), "hit=%uld miss=%uld copy=%uld\n",
332                         b->hit, b->miss, b->copy);
333                 n = readstr(off, a, n, buf);
334                 return n;
335         }
336 }
337
338 static void
339 bridgeoption(Bridge *b, char *option, int value)
340 {
341         if(strcmp(option, "tcpmss") == 0)
342                 b->tcpmss = value;
343         else
344                 error("unknown bridge option");
345 }
346
347
348 static long
349 bridgewrite(Chan *c, void *a, long n, vlong off)
350 {
351         Bridge *b = bridgetab + c->dev;
352         Cmdbuf *cb;
353         char *arg0, *p;
354         
355         USED(off);
356         switch(TYPE(c->qid)) {
357         default:
358                 error(Eperm);
359         case Qbctl:
360                 cb = parsecmd(a, n);
361                 qlock(b);
362                 if(waserror()) {
363                         qunlock(b);
364                         free(cb);
365                         nexterror();
366                 }
367                 if(cb->nf == 0)
368                         error("short write");
369                 arg0 = cb->f[0];
370                 if(strcmp(arg0, "bind") == 0) {
371                         portbind(b, cb->nf-1, cb->f+1);
372                 } else if(strcmp(arg0, "unbind") == 0) {
373                         portunbind(b, cb->nf-1, cb->f+1);
374                 } else if(strcmp(arg0, "cacheflush") == 0) {
375                         log(b, Logcache, "cache flush\n");
376                         memset(b->cache, 0, CacheSize*sizeof(Centry));
377                 } else if(strcmp(arg0, "set") == 0) {
378                         if(cb->nf != 2)
379                                 error("usage: set option");
380                         bridgeoption(b, cb->f[1], 1);
381                 } else if(strcmp(arg0, "clear") == 0) {
382                         if(cb->nf != 2)
383                                 error("usage: clear option");
384                         bridgeoption(b, cb->f[1], 0);
385                 } else if(strcmp(arg0, "delay") == 0) {
386                         if(cb->nf != 3)
387                                 error("usage: delay delay0 delayn");
388                         b->delay0 = strtol(cb->f[1], nil, 10);
389                         b->delayn = strtol(cb->f[2], nil, 10);
390                 } else
391                         error("unknown control request");
392                 poperror();
393                 qunlock(b);
394                 free(cb);
395                 return n;
396         case Qlog:
397                 cb = parsecmd(a, n);
398                 p = logctl(b, cb->nf, cb->f, logflags);
399                 free(cb);
400                 if(p != nil)
401                         error(p);
402                 return n;
403         }
404 }
405
406 static int
407 bridgegen(Chan *c, char *, Dirtab*, int, int s, Dir *dp)
408 {
409         Bridge *b = bridgetab + c->dev;
410         int type = TYPE(c->qid);
411         Dirtab *dt;
412         Qid qid;
413
414         if(s  == DEVDOTDOT){
415                 switch(TYPE(c->qid)){
416                 case Qtopdir:
417                 case Qbridgedir:
418                         snprint(up->genbuf, sizeof(up->genbuf), "#B%ld", c->dev);
419                         mkqid(&qid, Qtopdir, 0, QTDIR);
420                         devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
421                         break;
422                 case Qportdir:
423                         snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
424                         mkqid(&qid, Qbridgedir, 0, QTDIR);
425                         devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
426                         break;
427                 default:
428                         panic("bridgewalk %llux", c->qid.path);
429                 }
430                 return 1;
431         }
432
433         switch(type) {
434         default:
435                 /* non-directory entries end up here */
436                 if(c->qid.type & QTDIR)
437                         panic("bridgegen: unexpected directory");       
438                 if(s != 0)
439                         return -1;
440                 dt = dirtab[TYPE(c->qid)];
441                 if(dt == nil)
442                         panic("bridgegen: unknown type: %lud", TYPE(c->qid));
443                 devdir(c, c->qid, dt->name, dt->length, eve, dt->perm, dp);
444                 return 1;
445         case Qtopdir:
446                 if(s != 0)
447                         return -1;
448                 snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
449                 mkqid(&qid, QID(0, Qbridgedir), 0, QTDIR);
450                 devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
451                 return 1;
452         case Qbridgedir:
453                 if(s<nelem(bridgedirtab)) {
454                         dt = bridgedirtab+s;
455                         devdir(c, dt->qid, dt->name, dt->length, eve, dt->perm, dp);
456                         return 1;
457                 }
458                 s -= nelem(bridgedirtab);
459                 if(s >= b->nport)
460                         return -1;
461                 mkqid(&qid, QID(s, Qportdir), 0, QTDIR);
462                 snprint(up->genbuf, sizeof(up->genbuf), "%d", s);
463                 devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
464                 return 1;
465         case Qportdir:
466                 if(s>=nelem(portdirtab))
467                         return -1;
468                 dt = portdirtab+s;
469                 mkqid(&qid, QID(PORT(c->qid),TYPE(dt->qid)), 0, QTFILE);
470                 devdir(c, qid, dt->name, dt->length, eve, dt->perm, dp);
471                 return 1;
472         }
473 }
474
/*
 * parse mac address; also in netif.c
 *
 * Reads alen pairs of hex digits from the string from, each pair
 * optionally followed by ':', and stores one byte per pair in to.
 * Returns 0 on success, or -1 if the string ends before alen pairs
 * have been read (bytes already converted stay in to).
 */
static int
parseaddr(uchar *to, char *from, int alen)
{
	char hex[3];
	int k;

	hex[2] = 0;
	for(k = 0; k < alen; k++){
		if(from[0] == 0 || from[1] == 0)
			return -1;
		hex[0] = from[0];
		hex[1] = from[1];
		to[k] = strtoul(hex, 0, 16);
		from += 2;
		if(*from == ':')
			from++;
	}
	return 0;
}
498
499 // assumes b is locked
500 static void
501 portbind(Bridge *b, int argc, char *argv[])
502 {
503         Port *port;
504         Chan *ctl;
505         int type = 0, i, n;
506         ulong ownhash;
507         char *dev, *dev2 = nil;
508         char buf[100], name[KNAMELEN], path[8*KNAMELEN];
509         static char usage[] = "usage: bind ether|tunnel name ownhash dev [dev2]";
510
511         memset(name, 0, KNAMELEN);
512         if(argc < 4)
513                 error(usage);
514         if(strcmp(argv[0], "ether") == 0) {
515                 if(argc != 4)
516                         error(usage);
517                 type = Tether;
518                 strncpy(name, argv[1], KNAMELEN);
519                 name[KNAMELEN-1] = 0;
520 //              parseaddr(addr, argv[1], Eaddrlen);
521         } else if(strcmp(argv[0], "tunnel") == 0) {
522                 if(argc != 5)
523                         error(usage);
524                 type = Ttun;
525                 strncpy(name, argv[1], KNAMELEN);
526                 name[KNAMELEN-1] = 0;
527 //              parseip(addr, argv[1]);
528                 dev2 = argv[4];
529         } else
530                 error(usage);
531         ownhash = atoi(argv[2]);
532         dev = argv[3];
533         for(i=0; i<b->nport; i++) {
534                 port = b->port[i];
535                 if(port != nil && port->type == type &&
536                     memcmp(port->name, name, KNAMELEN) == 0)
537                         error("port in use");
538         }
539         for(i=0; i<Maxport; i++)
540                 if(b->port[i] == nil)
541                         break;
542         if(i == Maxport)
543                 error("no more ports");
544         port = smalloc(sizeof(Port));
545         port->ref = 1;
546         port->id = i;
547         port->ownhash = ownhash;
548
549         if(waserror()) {
550                 portfree(port);
551                 nexterror();
552         }
553         port->type = type;
554         memmove(port->name, name, KNAMELEN);
555         switch(port->type) {
556         default:
557                 panic("portbind: unknown port type: %d", type);
558         case Tether:
559                 snprint(path, sizeof(path), "%s/clone", dev);
560                 ctl = namec(path, Aopen, ORDWR, 0);
561                 if(waserror()) {
562                         cclose(ctl);
563                         nexterror();
564                 }
565                 // check addr?
566
567                 // get directory name
568                 n = devtab[ctl->type]->read(ctl, buf, sizeof(buf)-1, 0);
569                 buf[n] = 0;
570                 snprint(path, sizeof(path), "%s/%lud/data", dev, strtoul(buf, 0, 0));
571
572                 // setup connection to be promiscuous
573                 snprint(buf, sizeof(buf), "connect -1");
574                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
575                 snprint(buf, sizeof(buf), "nonblocking");
576                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
577                 snprint(buf, sizeof(buf), "promiscuous");
578                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
579                 snprint(buf, sizeof(buf), "bridge");
580                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
581
582                 // open data port
583                 port->data[0] = namec(path, Aopen, ORDWR, 0);
584                 // dup it
585                 incref(port->data[0]);
586                 port->data[1] = port->data[0];
587
588                 poperror();
589                 cclose(ctl);            
590
591                 break;
592         case Ttun:
593                 port->data[0] = namec(dev, Aopen, OREAD, 0);
594                 port->data[1] = namec(dev2, Aopen, OWRITE, 0);
595                 break;
596         }
597
598         poperror();
599
600         /* committed to binding port */
601         b->port[port->id] = port;
602         port->bridge = b;
603         if(b->nport <= port->id)
604                 b->nport = port->id+1;
605
606         // assumes kproc always succeeds
607         incref(port);
608         snprint(buf, sizeof(buf), "bridge:%s", dev);
609         kproc(buf, etherread, port);
610 }
611
612 // assumes b is locked
613 static void
614 portunbind(Bridge *b, int argc, char *argv[])
615 {
616         int type = 0, i;
617         char name[KNAMELEN];
618         ulong ownhash;
619         Port *port = nil;
620         static char usage[] = "usage: unbind ether|tunnel addr [ownhash]";
621
622         memset(name, 0, KNAMELEN);
623         if(argc < 2 || argc > 3)
624                 error(usage);
625         if(strcmp(argv[0], "ether") == 0) {
626                 type = Tether;
627                 strncpy(name, argv[1], KNAMELEN);
628                 name[KNAMELEN-1] = 0;
629 //              parseaddr(addr, argv[1], Eaddrlen);
630         } else if(strcmp(argv[0], "tunnel") == 0) {
631                 type = Ttun;
632                 strncpy(name, argv[1], KNAMELEN);
633                 name[KNAMELEN-1] = 0;
634 //              parseip(addr, argv[1]);
635         } else
636                 error(usage);
637         if(argc == 3)
638                 ownhash = atoi(argv[2]);
639         else
640                 ownhash = 0;
641         for(i=0; i<b->nport; i++) {
642                 port = b->port[i];
643                 if(port != nil && port->type == type &&
644                     memcmp(port->name, name, KNAMELEN) == 0)
645                         break;
646         }
647         if(i == b->nport)
648                 error("port not found");
649         if(ownhash != 0 && port->ownhash != 0 && ownhash != port->ownhash)
650                 error("bad owner hash");
651
652         port->closed = 1;
653         b->port[i] = nil;       // port is now unbound
654         cacheflushport(b, i);
655
656         // try and stop reader
657         if(port->readp)
658                 postnote(port->readp, 1, "unbind", 0);
659         portfree(port);
660 }
661
662 // assumes b is locked
663 static Centry *
664 cachelookup(Bridge *b, uchar d[Eaddrlen])
665 {
666         int i;
667         uint h;
668         Centry *p;
669         long sec;
670
671         // dont cache multicast or broadcast
672         if(d[0] & 1)
673                 return 0;
674
675         h = 0;
676         for(i=0; i<Eaddrlen; i++) {
677                 h *= 7;
678                 h += d[i];
679         }
680         h %= CacheHash;
681         p = b->cache + h;
682         sec = TK2SEC(m->ticks);
683         for(i=0; i<CacheLook; i++,p++) {
684                 if(memcmp(d, p->d, Eaddrlen) == 0) {
685                         p->dst++;
686                         if(sec >= p->expire) {
687                                 log(b, Logcache, "expired cache entry: %E %d\n",
688                                         d, p->port);
689                                 return nil;
690                         }
691                         p->expire = sec + CacheTimeout;
692                         return p;
693                 }
694         }
695         log(b, Logcache, "cache miss: %E\n", d);
696         return nil;
697 }
698
699 // assumes b is locked
700 static void
701 cacheupdate(Bridge *b, uchar d[Eaddrlen], int port)
702 {
703         int i;
704         uint h;
705         Centry *p, *pp;
706         long sec;
707
708         // dont cache multicast or broadcast
709         if(d[0] & 1) {
710                 log(b, Logcache, "bad source address: %E\n", d);
711                 return;
712         }
713         
714         h = 0;
715         for(i=0; i<Eaddrlen; i++) {
716                 h *= 7;
717                 h += d[i];
718         }
719         h %= CacheHash;
720         p = b->cache + h;
721         pp = p;
722         sec = p->expire;
723
724         // look for oldest entry
725         for(i=0; i<CacheLook; i++,p++) {
726                 if(memcmp(p->d, d, Eaddrlen) == 0) {
727                         p->expire = TK2SEC(m->ticks) + CacheTimeout;
728                         if(p->port != port) {
729                                 log(b, Logcache, "NIC changed port %d->%d: %E\n",
730                                         p->port, port, d);
731                                 p->port = port;
732                         }
733                         p->src++;
734                         return;
735                 }
736                 if(p->expire < sec) {
737                         sec = p->expire;
738                         pp = p;
739                 }
740         }
741         if(pp->expire != 0)
742                 log(b, Logcache, "bumping from cache: %E %d\n", pp->d, pp->port);
743         pp->expire = TK2SEC(m->ticks) + CacheTimeout;
744         memmove(pp->d, d, Eaddrlen);
745         pp->port = port;
746         pp->src = 1;
747         pp->dst = 0;
748         log(b, Logcache, "adding to cache: %E %d\n", pp->d, pp->port);
749 }
750
751 // assumes b is locked
752 static void
753 cacheflushport(Bridge *b, int port)
754 {
755         Centry *ce;
756         int i;
757
758         ce = b->cache;
759         for(i=0; i<CacheSize; i++,ce++) {
760                 if(ce->port != port)
761                         continue;
762                 memset(ce, 0, sizeof(Centry));
763         }
764 }
765
766 static char *
767 cachedump(Bridge *b)
768 {
769         int i, n;
770         long sec, off;
771         char *buf, *p, *ep;
772         Centry *ce;
773         char c;
774
775         qlock(b);
776         if(waserror()) {
777                 qunlock(b);
778                 nexterror();
779         }
780         sec = TK2SEC(m->ticks);
781         n = 0;
782         for(i=0; i<CacheSize; i++)
783                 if(b->cache[i].expire != 0)
784                         n++;
785         
786         n *= 51;        // change if print format is changed
787         n += 10;        // some slop at the end
788         buf = malloc(n);
789         if(buf == nil)
790                 error(Enomem);
791         p = buf;
792         ep = buf + n;
793         ce = b->cache;
794         off = seconds() - sec;
795         for(i=0; i<CacheSize; i++,ce++) {
796                 if(ce->expire == 0)
797                         continue;       
798                 c = (sec < ce->expire)?'v':'e';
799                 p += snprint(p, ep-p, "%E %2d %10ld %10ld %10ld %c\n", ce->d,
800                         ce->port, ce->src, ce->dst, ce->expire+off, c);
801         }
802         *p = 0;
803         poperror();
804         qunlock(b);
805
806         return buf;
807 }
808
809
810
811 // assumes b is locked, no error return
812 static void
813 ethermultiwrite(Bridge *b, Block *bp, Port *port)
814 {
815         Port *oport;
816         Etherpkt *ep;
817         int i, mcast;
818
819         ep = (Etherpkt*)bp->rp;
820         mcast = ep->d[0] & 1;           /* multicast bit of ethernet address */
821
822         oport = nil;
823         for(i=0; i<b->nport; i++) {
824                 if(i == port->id || b->port[i] == nil)
825                         continue;
826                 /*
827                  * we need to forward multicast packets for ipv6,
828                  * so always do it.
829                  */
830                 if(mcast)
831                         b->port[i]->outmulti++;
832                 else
833                         b->port[i]->outunknown++;
834
835                 // delay one so that the last write does not copy
836                 if(oport != nil) {
837                         b->copy++;
838                         etherwrite(oport, copyblock(bp, BLEN(bp)));
839                 }
840                 oport = b->port[i];
841         }
842
843         // last write free block
844         if(oport)
845                 etherwrite(oport, bp);
846         else
847                 freeb(bp);
848 }
849
850 static void
851 tcpmsshack(Etherpkt *epkt, int n)
852 {
853         int hl, optlen;
854         Tcphdr *tcphdr;
855         ulong mss, cksum;
856         uchar *optr;
857
858         /* ignore non-ipv4 packets */
859         switch(nhgets(epkt->type)){
860         case ETIP4:
861         case ETIP6:
862                 break;
863         default:
864                 return;
865         }
866         n -= ETHERHDRSIZE;
867         if(n < 1)
868                 return;
869         switch(epkt->data[0]&0xF0){
870         case IP_VER4:
871                 hl = (epkt->data[0]&15)<<2;
872                 if(n < hl+TCPHDR || hl < IP4HDR || epkt->data[9] != TCP)
873                         return;
874                 n -= hl;
875                 tcphdr = (Tcphdr*)(epkt->data + hl);
876                 break;
877         case IP_VER6:
878                 if(n < IP6HDR+TCPHDR || epkt->data[6] != TCP)
879                         return;
880                 n -= IP6HDR;
881                 tcphdr = (Tcphdr*)(epkt->data + IP6HDR);
882                 break;
883         default:
884                 return;
885         }
886         // MSS can only appear in SYN packet
887         if(!(tcphdr->flag[1] & SYN))
888                 return;
889         hl = (tcphdr->flag[0] & 0xf0)>>2;
890         if(n < hl)
891                 return;
892
893         // check for MSS option
894         optr = (uchar*)tcphdr + TCPHDR;
895         n = hl - TCPHDR;
896         for(;;) {
897                 if(n <= 0 || *optr == EOLOPT)
898                         return;
899                 if(*optr == NOOPOPT) {
900                         n--;
901                         optr++;
902                         continue;
903                 }
904                 optlen = optr[1];
905                 if(optlen < 2 || optlen > n)
906                         return;
907                 if(*optr == MSSOPT && optlen == MSS_LENGTH)
908                         break;
909                 n -= optlen;
910                 optr += optlen;
911         }
912
913         mss = nhgets(optr+2);
914         if(mss <= TcpMssMax)
915                 return;
916
917         // fit checksum
918         cksum = nhgets(tcphdr->cksum);
919         if(optr-(uchar*)tcphdr & 1) {
920 // print("tcpmsshack: odd alignment!\n");
921                 // odd alignments are a pain
922                 cksum += nhgets(optr+1);
923                 cksum -= (optr[1]<<8)|(TcpMssMax>>8);
924                 cksum += (cksum>>16);
925                 cksum &= 0xffff;
926                 cksum += nhgets(optr+3);
927                 cksum -= ((TcpMssMax&0xff)<<8)|optr[4];
928                 cksum += (cksum>>16);
929         } else {
930                 cksum += mss;
931                 cksum -= TcpMssMax;
932                 cksum += (cksum>>16);
933         }
934         hnputs(tcphdr->cksum, cksum);
935         hnputs(optr+2, TcpMssMax);
936 }
937
938 /*
939  *  process to read from the ethernet
940  */
941 static void
942 etherread(void *a)
943 {
944         Port *port = a;
945         Bridge *b = port->bridge;
946         Block *bp;
947         Etherpkt *ep;
948         Centry *ce;
949         long md, n;
950         
951         qlock(b);
952         port->readp = up;       /* hide identity under a rock for unbind */
953
954         while(!port->closed){
955                 // release lock to read - error means it is time to quit
956                 qunlock(b);
957                 if(waserror()) {
958                         print("etherread read error: %s\n", up->errstr);
959                         qlock(b);
960                         break;
961                 }
962                 bp = devtab[port->data[0]->type]->bread(port->data[0], MaxMTU, 0);
963                 poperror();
964                 qlock(b);
965                 if(bp == nil)
966                         break;
967                 n = BLEN(bp);
968                 if(port->closed || n < ETHERHDRSIZE){
969                         freeb(bp);
970                         continue;
971                 }
972                 if(waserror()) {
973 //                      print("etherread bridge error\n");
974                         freeb(bp);
975                         continue;
976                 }
977                 port->in++;
978
979                 ep = (Etherpkt*)bp->rp;
980                 cacheupdate(b, ep->s, port->id);
981                 if(b->tcpmss)
982                         tcpmsshack(ep, n);
983
984                 /*
985                  * delay packets to simulate a slow link
986                  */
987                 if(b->delay0 != 0 || b->delayn != 0){
988                         md = b->delay0 + b->delayn * n;
989                         if(md > 0)
990                                 microdelay(md);
991                 }
992
993                 poperror();     /* must now dispose of bp */
994
995                 if(ep->d[0] & 1) {
996                         log(b, Logmcast, "multicast: port=%d src=%E dst=%E type=%#.4ux\n",
997                                 port->id, ep->s, ep->d, ep->type[0]<<8|ep->type[1]);
998                         port->inmulti++;
999                         ethermultiwrite(b, bp, port);
1000                 } else {
1001                         ce = cachelookup(b, ep->d);
1002                         if(ce == nil) {
1003                                 b->miss++;
1004                                 port->inunknown++;
1005                                 ethermultiwrite(b, bp, port);
1006                         }else if(ce->port != port->id){
1007                                 b->hit++;
1008                                 etherwrite(b->port[ce->port], bp);
1009                         }else
1010                                 freeb(bp);
1011                 }
1012         }
1013 //      print("etherread: trying to exit\n");
1014         port->readp = nil;
1015         portfree(port);
1016         qunlock(b);
1017         pexit("hangup", 1);
1018 }
1019
1020 static int
1021 fragment(Etherpkt *epkt, int n)
1022 {
1023         Ip4hdr *iphdr;
1024
1025         if(n <= TunnelMtu)
1026                 return 0;
1027
1028         /* ignore non-ipv4 packets */
1029         if(nhgets(epkt->type) != ETIP4)
1030                 return 0;
1031         iphdr = (Ip4hdr*)(epkt->data);
1032         n -= ETHERHDRSIZE;
1033         /*
1034          * ignore: IP runt packets, bad packets (I don't handle IP
1035          * options for the moment), packets with don't-fragment set,
1036          * and short blocks.
1037          */
1038         if(n < IP4HDR || iphdr->vihl != (IP_VER4|IP_HLEN4) ||
1039             iphdr->frag[0] & (IP_DF>>8) || nhgets(iphdr->length) > n)
1040                 return 0;
1041
1042         return 1;
1043 }
1044
1045 static void
1046 etherwrite(Port *port, Block *bp)
1047 {
1048         Ip4hdr *eh, *feh;
1049         Etherpkt *epkt;
1050         int n, lid, len, seglen, dlen, blklen, mf;
1051         Block *nb;
1052         ushort fragoff, frag;
1053
1054         port->out++;
1055         n = BLEN(bp);
1056         epkt = (Etherpkt*)bp->rp;
1057         if(port->type != Ttun || !fragment(epkt, n)) {
1058                 if(!waserror()){
1059                         /* don't generate small packets */
1060                         if(n < ETHERMINTU)
1061                                 bp = adjustblock(bp, ETHERMINTU);
1062                         devtab[port->data[1]->type]->bwrite(port->data[1], bp, 0);
1063                         poperror();
1064                 }
1065                 return;
1066         }
1067         port->outfrag++;
1068         if(waserror()){
1069                 freeb(bp);      
1070                 return;
1071         }
1072
1073         seglen = (TunnelMtu - ETHERHDRSIZE - IP4HDR) & ~7;
1074         eh = (Ip4hdr*)(epkt->data);
1075         len = nhgets(eh->length);
1076         frag = nhgets(eh->frag);
1077         mf = frag & IP_MF;
1078         frag <<= 3;
1079         dlen = len - IP4HDR;
1080         lid = nhgets(eh->id);
1081         bp->rp += ETHERHDRSIZE+IP4HDR;
1082         
1083         if(0)
1084                 print("seglen=%d, dlen=%d, mf=%x, frag=%d\n",
1085                         seglen, dlen, mf, frag);
1086         for(fragoff = 0; fragoff < dlen; fragoff += seglen) {
1087                 nb = allocb(ETHERHDRSIZE+IP4HDR+seglen);
1088                 
1089                 feh = (Ip4hdr*)(nb->wp+ETHERHDRSIZE);
1090
1091                 memmove(nb->wp, epkt, ETHERHDRSIZE+IP4HDR);
1092                 nb->wp += ETHERHDRSIZE+IP4HDR;
1093
1094                 if((fragoff + seglen) >= dlen) {
1095                         seglen = dlen - fragoff;
1096                         hnputs(feh->frag, (frag+fragoff)>>3 | mf);
1097                 }
1098                 else    
1099                         hnputs(feh->frag, (frag+fragoff>>3) | IP_MF);
1100
1101                 hnputs(feh->length, seglen + IP4HDR);
1102                 hnputs(feh->id, lid);
1103
1104                 if(seglen){
1105                         blklen = BLEN(bp);
1106                         if(seglen < blklen)
1107                                 blklen = seglen;
1108                         memmove(nb->wp, bp->rp, blklen);
1109                         nb->wp += blklen;
1110                         bp->rp += blklen;
1111                 }
1112
1113                 feh->cksum[0] = 0;
1114                 feh->cksum[1] = 0;
1115                 hnputs(feh->cksum, ipcsum(&feh->vihl));
1116         
1117                 /* don't generate small packets */
1118                 if(BLEN(nb) < ETHERMINTU)
1119                         nb = adjustblock(nb, ETHERMINTU);
1120                 devtab[port->data[1]->type]->bwrite(port->data[1], nb, 0);
1121         }
1122         poperror();
1123         freeb(bp);      
1124 }
1125
1126 // hold b lock
1127 static void
1128 portfree(Port *port)
1129 {
1130         if(decref(port) != 0)
1131                 return;
1132
1133         if(port->data[0])
1134                 cclose(port->data[0]);
1135         if(port->data[1])
1136                 cclose(port->data[1]);
1137         memset(port, 0, sizeof(Port));
1138         free(port);
1139 }
1140
1141 Dev bridgedevtab = {
1142         'B',
1143         "bridge",
1144
1145         devreset,
1146         bridgeinit,
1147         devshutdown,
1148         bridgeattach,
1149         bridgewalk,
1150         bridgestat,
1151         bridgeopen,
1152         devcreate,
1153         bridgeclose,
1154         bridgeread,
1155         devbread,
1156         bridgewrite,
1157         devbwrite,
1158         devremove,
1159         devwstat,
1160 };