]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/port/devbridge.c
devbridge: simplify etherwrite() as we dont deal with block lists
[plan9front.git] / sys / src / 9 / port / devbridge.c
1 /*
2  * IPv4 Ethernet bridge
3  */
4 #include "u.h"
5 #include "../port/lib.h"
6 #include "mem.h"
7 #include "dat.h"
8 #include "fns.h"
9 #include "../ip/ip.h"
10 #include "../port/netif.h"
11 #include "../port/error.h"
12
/* Shorthand type names for the structures declared later in this file. */
13 typedef struct Bridge   Bridge;
14 typedef struct Port     Port;
15 typedef struct Centry   Centry;
16 typedef struct Iphdr    Iphdr;
17 typedef struct Tcphdr   Tcphdr;
18
/*
 * File types (stored in the low byte of the qid path, see TYPE())
 * followed by the compile-time limits of the bridge.
 */
19 enum
20 {
21         Qtopdir=        1,              /* top level directory */
22
23         Qbridgedir,                     /* bridge* directory */
24         Qbctl,
25         Qstats,
26         Qcache,
27         Qlog,
28
29         Qportdir,                       /* directory for one bound port */
30         Qpctl,
31         Qlocal,
32         Qstatus,
33
34         MaxQ,                           /* size of the dirtab[] lookup table */
35
36         Maxbridge=      4,
37         Maxport=        128,            // power of 2 (PORT() masks with Maxport-1)
38         CacheHash=      257,            // prime
39         CacheLook=      5,              // how many cache entries to examine
40         CacheSize=      (CacheHash+CacheLook-1),        // room to probe CacheLook slots from any hash value
41         CacheTimeout=   5*60,           // timeout for cache entry in seconds
42         MaxMTU= IP_MAX, // allow for jumbo frames and large UDP
43
44         TcpMssMax = 1300,               // max desirable Tcp MSS value
45         TunnelMtu = 1400,
46 };
47
/* files found in each bridgeN directory */
48 static Dirtab bridgedirtab[]={
49         "ctl",          {Qbctl},        0,      0666,
50         "stats",        {Qstats},       0,      0444,
51         "cache",        {Qcache},       0,      0444,
52         "log",          {Qlog},         0,      0666,
53 };
54
/* files found in each per-port directory */
55 static Dirtab portdirtab[]={
56         "ctl",          {Qpctl},        0,      0666,
57         "local",        {Qlocal},       0,      0444,
58         "status",       {Qstatus},      0,      0444,
59 };
60
/* log classes; passed as the flag argument of log() */
61 enum {
62         Logcache=       (1<<0),
63         Logmcast=       (1<<1),
64 };
65
66 // types of interfaces (Port.type)
67 enum
68 {
69         Tether,
70         Ttun,
71 };
72
/* names accepted by the log file's control messages, nil terminated */
73 static Logflag logflags[] =
74 {
75         { "cache",      Logcache, },
76         { "multicast",  Logmcast, },
77         { nil,          0, },
78 };
79
/* file type -> Dirtab entry for non-directory files; filled in by bridgeinit */
80 static Dirtab   *dirtab[MaxQ];
81
/*
 * qid path layout: low 8 bits hold the file type (Q* value),
 * the bits above hold the port index.
 */
82 #define TYPE(x)         (((ulong)(x).path) & 0xff)
83 #define PORT(x)         ((((ulong)(x).path) >> 8)&(Maxport-1))
84 #define QID(x, y)       (((x)<<8) | (y))
85
/*
 * One slot of the MAC address cache.  A slot with expire == 0 is
 * treated as unused by cachedump and cacheupdate.
 */
86 struct Centry
87 {
88         uchar   d[Eaddrlen];    // ethernet address this entry describes
89         int     port;           // index of the port the address was last seen on
90         long    expire;         // entry expires this many seconds after boot time
91         long    src;            // times seen as a source address (cacheupdate)
92         long    dst;            // times looked up as a destination (cachelookup)
93 };
94
/*
 * State of one bridge (one entry of bridgetab).  The embedded QLock
 * is what qlock(b)/qunlock(b) operate on; the embedded Log is what
 * logopen(b)/logread(b, ...) operate on.
 */
95 struct Bridge
96 {
97         QLock;
98         int     nport;          // one past the highest port index ever bound
99         Port    *port[Maxport]; // bound ports, nil when the slot is free
100         Centry  cache[CacheSize];
101         ulong   hit;            // reported by the stats file
102         ulong   miss;           // reported by the stats file
103         ulong   copy;           // blocks copied by ethermultiwrite
104         long    delay0;         // constant microsecond delay per packet
105         long    delayn;         // microsecond delay per byte
106         int     tcpmss;         // modify tcpmss value
107
108         Log;
109 };
110
/*
 * One interface bound to a bridge.  Reference counted: the bridge
 * table holds one reference and the reader kproc another (see portbind).
 */
111 struct Port
112 {
113         Ref;
114         int     id;             // index in Bridge.port[]
115         Bridge  *bridge;
116         int     closed;         // set by portunbind; tells the reader to stop
117
118         Chan    *data[2];       // channel to data: [0] is read from, [1] is opened for writing
119
120         Proc    *readp;         // read proc
121         
122         // the following uniquely identifies the port
123         int     type;
124         char    name[KNAMELEN];
125         
126         // owner hash - avoids bind/unbind races
127         ulong   ownhash;
128
129         // various stats
130         int     in;             // number of packets read
131         int     inmulti;        // multicast or broadcast
132         int     inunknown;      // unknown address
133         int     out;            // number of packets written
134         int     outmulti;       // multicast or broadcast
135         int     outunknown;     // unknown address
136         int     outfrag;        // fragmented the packet
137         int     nentry;         // number of cache entries for this port
138 };
139
/* IP/TCP constants used by tcpmsshack */
140 enum {
141         IP_TCPPROTO     = 6,
142         EOLOPT          = 0,            /* TCP option: end of option list */
143         NOOPOPT         = 1,            /* TCP option: no-op padding */
144         MSSOPT          = 2,            /* TCP option: maximum segment size */
145         MSS_LENGTH      = 4,            /* total length of the MSS option */
146         SYN             = 0x02,         /* Pkt. is synchronise */
147         IPHDR           = 20,           /* sizeof(Iphdr) */
148 };
149
/* IPv4 header without options, laid out byte by byte */
150 struct Iphdr
151 {
152         uchar   vihl;           /* Version and header length */
153         uchar   tos;            /* Type of service */
154         uchar   length[2];      /* packet length */
155         uchar   id[2];          /* ip->identification */
156         uchar   frag[2];        /* Fragment information */
157         uchar   ttl;            /* Time to live */
158         uchar   proto;          /* Protocol */
159         uchar   cksum[2];       /* Header checksum */
160         uchar   src[4];         /* IP source */
161         uchar   dst[4];         /* IP destination */
162 };
163
/* TCP header without options, laid out byte by byte */
164 struct Tcphdr
165 {
166         uchar   sport[2];
167         uchar   dport[2];
168         uchar   seq[4];
169         uchar   ack[4];
170         uchar   flag[2];        /* [0] high nibble: header length in 32-bit words; [1]: flag bits */
171         uchar   win[2];
172         uchar   cksum[2];
173         uchar   urg[2];
174 };
175
/* all bridges; indexed by the device number chosen at attach time */
176 static Bridge bridgetab[Maxbridge];
177
/* open mode -> permission bits; only OREAD, OWRITE and ORDWR have entries */
178 static int m2p[] = {
179         [OREAD]         4,
180         [OWRITE]        2,
181         [ORDWR]         6
182 };
183
/* forward declarations */
184 static int      bridgegen(Chan *c, char*, Dirtab*, int, int s, Dir *dp);
185 static void     portbind(Bridge *b, int argc, char *argv[]);
186 static void     portunbind(Bridge *b, int argc, char *argv[]);
187 static void     etherread(void *a);
188 static char     *cachedump(Bridge *b);
189 static void     portfree(Port *port);
190 static void     cacheflushport(Bridge *b, int port);
191 static void     etherwrite(Port *port, Block *bp);
192
193 static void
194 bridgeinit(void)
195 {
196         int i;
197         Dirtab *dt;
198
199         // setup dirtab with non directory entries
200         for(i=0; i<nelem(bridgedirtab); i++) {
201                 dt = bridgedirtab + i;
202                 dirtab[TYPE(dt->qid)] = dt;
203         }
204         for(i=0; i<nelem(portdirtab); i++) {
205                 dt = portdirtab + i;
206                 dirtab[TYPE(dt->qid)] = dt;
207         }
208 }
209
210 static Chan*
211 bridgeattach(char* spec)
212 {
213         Chan *c;
214         int dev;
215
216         dev = atoi(spec);
217         if(dev<0 || dev >= Maxbridge)
218                 error("bad specification");
219
220         c = devattach('B', spec);
221         mkqid(&c->qid, QID(0, Qtopdir), 0, QTDIR);
222         c->dev = dev;
223         return c;
224 }
225
226 static Walkqid*
227 bridgewalk(Chan *c, Chan *nc, char **name, int nname)
228 {
229         return devwalk(c, nc, name, nname, (Dirtab*)0, 0, bridgegen);
230 }
231
232 static int
233 bridgestat(Chan* c, uchar* db, int n)
234 {
235         return devstat(c, db, n, (Dirtab *)0, 0L, bridgegen);
236 }
237
238 static Chan*
239 bridgeopen(Chan* c, int omode)
240 {
241         int perm;
242         Bridge *b;
243
244         omode &= 3;
245         perm = m2p[omode];
246         USED(perm);
247
248         b = bridgetab + c->dev;
249         USED(b);
250
251         switch(TYPE(c->qid)) {
252         default:
253                 break;
254         case Qlog:
255                 logopen(b);
256                 break;
257         case Qcache:
258                 c->aux = cachedump(b);
259                 break;
260         }
261         c->mode = openmode(omode);
262         c->flag |= COPEN;
263         c->offset = 0;
264         return c;
265 }
266
267 static void
268 bridgeclose(Chan* c)
269 {
270         Bridge *b  = bridgetab + c->dev;
271
272         switch(TYPE(c->qid)) {
273         case Qcache:
274                 if(c->flag & COPEN)
275                         free(c->aux);
276                 break;
277         case Qlog:
278                 if(c->flag & COPEN)
279                         logclose(b);
280                 break;
281         }
282 }
283
284 static long
285 bridgeread(Chan *c, void *a, long n, vlong off)
286 {
287         char buf[256];
288         Bridge *b = bridgetab + c->dev;
289         Port *port;
290         int i, ingood, outgood;
291
292         USED(off);
293         switch(TYPE(c->qid)) {
294         default:
295                 error(Egreg);
296         case Qtopdir:
297         case Qbridgedir:
298         case Qportdir:
299                 return devdirread(c, a, n, 0, 0, bridgegen);
300         case Qlog:
301                 return logread(b, a, off, n);
302         case Qlocal:
303                 return 0;       /* TO DO */
304         case Qstatus:
305                 qlock(b);
306                 if(waserror()){
307                         qunlock(b);
308                         nexterror();
309                 }
310                 port = b->port[PORT(c->qid)];
311                 if(port == 0)
312                         strcpy(buf, "unbound\n");
313                 else {
314                         i = 0;
315                         switch(port->type) {
316                         default:
317                                 panic("bridgeread: unknown port type: %d",
318                                         port->type);
319                         case Tether:
320                                 i += snprint(buf+i, sizeof(buf)-i, "ether %s: ", port->name);
321                                 break;
322                         case Ttun:
323                                 i += snprint(buf+i, sizeof(buf)-i, "tunnel %s: ", port->name);
324                                 break;
325                         }
326                         ingood = port->in - port->inmulti - port->inunknown;
327                         outgood = port->out - port->outmulti - port->outunknown;
328                         snprint(buf+i, sizeof(buf)-i,
329                                 "in=%d(%d:%d:%d) out=%d(%d:%d:%d:%d)\n",
330                                 port->in, ingood, port->inmulti, port->inunknown,
331                                 port->out, outgood, port->outmulti,
332                                 port->outunknown, port->outfrag);
333                 }
334                 poperror();
335                 qunlock(b);
336                 return readstr(off, a, n, buf);
337         case Qbctl:
338                 snprint(buf, sizeof(buf), "%s tcpmss\ndelay %ld %ld\n",
339                         b->tcpmss ? "set" : "clear", b->delay0, b->delayn);
340                 n = readstr(off, a, n, buf);
341                 return n;
342         case Qcache:
343                 n = readstr(off, a, n, c->aux);
344                 return n;
345         case Qstats:
346                 snprint(buf, sizeof(buf), "hit=%uld miss=%uld copy=%uld\n",
347                         b->hit, b->miss, b->copy);
348                 n = readstr(off, a, n, buf);
349                 return n;
350         }
351 }
352
353 static void
354 bridgeoption(Bridge *b, char *option, int value)
355 {
356         if(strcmp(option, "tcpmss") == 0)
357                 b->tcpmss = value;
358         else
359                 error("unknown bridge option");
360 }
361
362
363 static long
364 bridgewrite(Chan *c, void *a, long n, vlong off)
365 {
366         Bridge *b = bridgetab + c->dev;
367         Cmdbuf *cb;
368         char *arg0, *p;
369         
370         USED(off);
371         switch(TYPE(c->qid)) {
372         default:
373                 error(Eperm);
374         case Qbctl:
375                 cb = parsecmd(a, n);
376                 qlock(b);
377                 if(waserror()) {
378                         qunlock(b);
379                         free(cb);
380                         nexterror();
381                 }
382                 if(cb->nf == 0)
383                         error("short write");
384                 arg0 = cb->f[0];
385                 if(strcmp(arg0, "bind") == 0) {
386                         portbind(b, cb->nf-1, cb->f+1);
387                 } else if(strcmp(arg0, "unbind") == 0) {
388                         portunbind(b, cb->nf-1, cb->f+1);
389                 } else if(strcmp(arg0, "cacheflush") == 0) {
390                         log(b, Logcache, "cache flush\n");
391                         memset(b->cache, 0, CacheSize*sizeof(Centry));
392                 } else if(strcmp(arg0, "set") == 0) {
393                         if(cb->nf != 2)
394                                 error("usage: set option");
395                         bridgeoption(b, cb->f[1], 1);
396                 } else if(strcmp(arg0, "clear") == 0) {
397                         if(cb->nf != 2)
398                                 error("usage: clear option");
399                         bridgeoption(b, cb->f[1], 0);
400                 } else if(strcmp(arg0, "delay") == 0) {
401                         if(cb->nf != 3)
402                                 error("usage: delay delay0 delayn");
403                         b->delay0 = strtol(cb->f[1], nil, 10);
404                         b->delayn = strtol(cb->f[2], nil, 10);
405                 } else
406                         error("unknown control request");
407                 poperror();
408                 qunlock(b);
409                 free(cb);
410                 return n;
411         case Qlog:
412                 cb = parsecmd(a, n);
413                 p = logctl(b, cb->nf, cb->f, logflags);
414                 free(cb);
415                 if(p != nil)
416                         error(p);
417                 return n;
418         }
419 }
420
421 static int
422 bridgegen(Chan *c, char *, Dirtab*, int, int s, Dir *dp)
423 {
424         Bridge *b = bridgetab + c->dev;
425         int type = TYPE(c->qid);
426         Dirtab *dt;
427         Qid qid;
428
429         if(s  == DEVDOTDOT){
430                 switch(TYPE(c->qid)){
431                 case Qtopdir:
432                 case Qbridgedir:
433                         snprint(up->genbuf, sizeof(up->genbuf), "#B%ld", c->dev);
434                         mkqid(&qid, Qtopdir, 0, QTDIR);
435                         devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
436                         break;
437                 case Qportdir:
438                         snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
439                         mkqid(&qid, Qbridgedir, 0, QTDIR);
440                         devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
441                         break;
442                 default:
443                         panic("bridgewalk %llux", c->qid.path);
444                 }
445                 return 1;
446         }
447
448         switch(type) {
449         default:
450                 /* non-directory entries end up here */
451                 if(c->qid.type & QTDIR)
452                         panic("bridgegen: unexpected directory");       
453                 if(s != 0)
454                         return -1;
455                 dt = dirtab[TYPE(c->qid)];
456                 if(dt == nil)
457                         panic("bridgegen: unknown type: %lud", TYPE(c->qid));
458                 devdir(c, c->qid, dt->name, dt->length, eve, dt->perm, dp);
459                 return 1;
460         case Qtopdir:
461                 if(s != 0)
462                         return -1;
463                 snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
464                 mkqid(&qid, QID(0, Qbridgedir), 0, QTDIR);
465                 devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
466                 return 1;
467         case Qbridgedir:
468                 if(s<nelem(bridgedirtab)) {
469                         dt = bridgedirtab+s;
470                         devdir(c, dt->qid, dt->name, dt->length, eve, dt->perm, dp);
471                         return 1;
472                 }
473                 s -= nelem(bridgedirtab);
474                 if(s >= b->nport)
475                         return -1;
476                 mkqid(&qid, QID(s, Qportdir), 0, QTDIR);
477                 snprint(up->genbuf, sizeof(up->genbuf), "%d", s);
478                 devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
479                 return 1;
480         case Qportdir:
481                 if(s>=nelem(portdirtab))
482                         return -1;
483                 dt = portdirtab+s;
484                 mkqid(&qid, QID(PORT(c->qid),TYPE(dt->qid)), 0, QTFILE);
485                 devdir(c, qid, dt->name, dt->length, eve, dt->perm, dp);
486                 return 1;
487         }
488 }
489
490 // parse mac address; also in netif.c
491 static int
492 parseaddr(uchar *to, char *from, int alen)
493 {
494         char nip[4];
495         char *p;
496         int i;
497
498         p = from;
499         for(i = 0; i < alen; i++){
500                 if(*p == 0)
501                         return -1;
502                 nip[0] = *p++;
503                 if(*p == 0)
504                         return -1;
505                 nip[1] = *p++;
506                 nip[2] = 0;
507                 to[i] = strtoul(nip, 0, 16);
508                 if(*p == ':')
509                         p++;
510         }
511         return 0;
512 }
513
/*
 * Bind a new interface to bridge b.  argv is the ctl request without
 * its leading "bind":
 *	ether  name ownhash dev
 *	tunnel name ownhash dev dev2
 * For an ether port dev is a network directory: a connection is
 * cloned from it, switched to promiscuous bridge mode and its data
 * file is used for both directions.  For a tunnel dev is opened for
 * reading and dev2 for writing.  On success the port occupies the
 * first free slot of b->port[] and a kproc running etherread is
 * started for it.  Failures raise an error and free the port.
 */
514 // assumes b is locked
515 static void
516 portbind(Bridge *b, int argc, char *argv[])
517 {
518         Port *port;
519         Chan *ctl;
520         int type = 0, i, n;
521         ulong ownhash;
522         char *dev, *dev2 = nil;
523         char buf[100], name[KNAMELEN], path[8*KNAMELEN];
524         static char usage[] = "usage: bind ether|tunnel name ownhash dev [dev2]";
525
        // name is zero padded so it can be compared with memcmp below
526         memset(name, 0, KNAMELEN);
527         if(argc < 4)
528                 error(usage);
529         if(strcmp(argv[0], "ether") == 0) {
530                 if(argc != 4)
531                         error(usage);
532                 type = Tether;
533                 strncpy(name, argv[1], KNAMELEN);
534                 name[KNAMELEN-1] = 0;
535 //              parseaddr(addr, argv[1], Eaddrlen);
536         } else if(strcmp(argv[0], "tunnel") == 0) {
537                 if(argc != 5)
538                         error(usage);
539                 type = Ttun;
540                 strncpy(name, argv[1], KNAMELEN);
541                 name[KNAMELEN-1] = 0;
542 //              parseip(addr, argv[1]);
543                 dev2 = argv[4];
544         } else
545                 error(usage);
546         ownhash = atoi(argv[2]);
547         dev = argv[3];
        // refuse a second port with the same type and name
548         for(i=0; i<b->nport; i++) {
549                 port = b->port[i];
550                 if(port != nil && port->type == type &&
551                     memcmp(port->name, name, KNAMELEN) == 0)
552                         error("port in use");
553         }
        // find the first free slot
554         for(i=0; i<Maxport; i++)
555                 if(b->port[i] == nil)
556                         break;
557         if(i == Maxport)
558                 error("no more ports");
559         port = smalloc(sizeof(Port));
560         port->ref = 1;
561         port->id = i;
562         port->ownhash = ownhash;
563
        // from here on any error releases the half-built port
564         if(waserror()) {
565                 portfree(port);
566                 nexterror();
567         }
568         port->type = type;
569         memmove(port->name, name, KNAMELEN);
570         switch(port->type) {
571         default:
572                 panic("portbind: unknown port type: %d", type);
573         case Tether:
574                 snprint(path, sizeof(path), "%s/clone", dev);
575                 ctl = namec(path, Aopen, ORDWR, 0);
                // ctl is only needed during setup; close it on error too
576                 if(waserror()) {
577                         cclose(ctl);
578                         nexterror();
579                 }
580                 // check addr?
581
582                 // get directory name (what is read from ctl is parsed as the connection number)
583                 n = devtab[ctl->type]->read(ctl, buf, sizeof(buf)-1, 0);
584                 buf[n] = 0;
585                 snprint(path, sizeof(path), "%s/%lud/data", dev, strtoul(buf, 0, 0));
586
587                 // setup connection to be promiscuous
588                 snprint(buf, sizeof(buf), "connect -1");
589                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
590                 snprint(buf, sizeof(buf), "promiscuous");
591                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
592                 snprint(buf, sizeof(buf), "bridge");
593                 devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
594
595                 // open data port
596                 port->data[0] = namec(path, Aopen, ORDWR, 0);
597                 // dup it: both directions share one channel, so it carries two references
598                 incref(port->data[0]);
599                 port->data[1] = port->data[0];
600
601                 poperror();
602                 cclose(ctl);            
603
604                 break;
605         case Ttun:
                // separate channels: dev is read from, dev2 is written to
606                 port->data[0] = namec(dev, Aopen, OREAD, 0);
607                 port->data[1] = namec(dev2, Aopen, OWRITE, 0);
608                 break;
609         }
610
611         poperror();
612
613         /* committed to binding port */
614         b->port[port->id] = port;
615         port->bridge = b;
616         if(b->nport <= port->id)
617                 b->nport = port->id+1;
618
619         // assumes kproc always succeeds
        // the extra reference belongs to the reader process
620         incref(port);
621         snprint(buf, sizeof(buf), "bridge:%s", dev);
622         kproc(buf, etherread, port);
623 }
624
625 // assumes b is locked
626 static void
627 portunbind(Bridge *b, int argc, char *argv[])
628 {
629         int type = 0, i;
630         char name[KNAMELEN];
631         ulong ownhash;
632         Port *port = nil;
633         static char usage[] = "usage: unbind ether|tunnel addr [ownhash]";
634
635         memset(name, 0, KNAMELEN);
636         if(argc < 2 || argc > 3)
637                 error(usage);
638         if(strcmp(argv[0], "ether") == 0) {
639                 type = Tether;
640                 strncpy(name, argv[1], KNAMELEN);
641                 name[KNAMELEN-1] = 0;
642 //              parseaddr(addr, argv[1], Eaddrlen);
643         } else if(strcmp(argv[0], "tunnel") == 0) {
644                 type = Ttun;
645                 strncpy(name, argv[1], KNAMELEN);
646                 name[KNAMELEN-1] = 0;
647 //              parseip(addr, argv[1]);
648         } else
649                 error(usage);
650         if(argc == 3)
651                 ownhash = atoi(argv[2]);
652         else
653                 ownhash = 0;
654         for(i=0; i<b->nport; i++) {
655                 port = b->port[i];
656                 if(port != nil && port->type == type &&
657                     memcmp(port->name, name, KNAMELEN) == 0)
658                         break;
659         }
660         if(i == b->nport)
661                 error("port not found");
662         if(ownhash != 0 && port->ownhash != 0 && ownhash != port->ownhash)
663                 error("bad owner hash");
664
665         port->closed = 1;
666         b->port[i] = nil;       // port is now unbound
667         cacheflushport(b, i);
668
669         // try and stop reader
670         if(port->readp)
671                 postnote(port->readp, 1, "unbind", 0);
672         portfree(port);
673 }
674
675 // assumes b is locked
676 static Centry *
677 cachelookup(Bridge *b, uchar d[Eaddrlen])
678 {
679         int i;
680         uint h;
681         Centry *p;
682         long sec;
683
684         // dont cache multicast or broadcast
685         if(d[0] & 1)
686                 return 0;
687
688         h = 0;
689         for(i=0; i<Eaddrlen; i++) {
690                 h *= 7;
691                 h += d[i];
692         }
693         h %= CacheHash;
694         p = b->cache + h;
695         sec = TK2SEC(m->ticks);
696         for(i=0; i<CacheLook; i++,p++) {
697                 if(memcmp(d, p->d, Eaddrlen) == 0) {
698                         p->dst++;
699                         if(sec >= p->expire) {
700                                 log(b, Logcache, "expired cache entry: %E %d\n",
701                                         d, p->port);
702                                 return nil;
703                         }
704                         p->expire = sec + CacheTimeout;
705                         return p;
706                 }
707         }
708         log(b, Logcache, "cache miss: %E\n", d);
709         return nil;
710 }
711
712 // assumes b is locked
713 static void
714 cacheupdate(Bridge *b, uchar d[Eaddrlen], int port)
715 {
716         int i;
717         uint h;
718         Centry *p, *pp;
719         long sec;
720
721         // dont cache multicast or broadcast
722         if(d[0] & 1) {
723                 log(b, Logcache, "bad source address: %E\n", d);
724                 return;
725         }
726         
727         h = 0;
728         for(i=0; i<Eaddrlen; i++) {
729                 h *= 7;
730                 h += d[i];
731         }
732         h %= CacheHash;
733         p = b->cache + h;
734         pp = p;
735         sec = p->expire;
736
737         // look for oldest entry
738         for(i=0; i<CacheLook; i++,p++) {
739                 if(memcmp(p->d, d, Eaddrlen) == 0) {
740                         p->expire = TK2SEC(m->ticks) + CacheTimeout;
741                         if(p->port != port) {
742                                 log(b, Logcache, "NIC changed port %d->%d: %E\n",
743                                         p->port, port, d);
744                                 p->port = port;
745                         }
746                         p->src++;
747                         return;
748                 }
749                 if(p->expire < sec) {
750                         sec = p->expire;
751                         pp = p;
752                 }
753         }
754         if(pp->expire != 0)
755                 log(b, Logcache, "bumping from cache: %E %d\n", pp->d, pp->port);
756         pp->expire = TK2SEC(m->ticks) + CacheTimeout;
757         memmove(pp->d, d, Eaddrlen);
758         pp->port = port;
759         pp->src = 1;
760         pp->dst = 0;
761         log(b, Logcache, "adding to cache: %E %d\n", pp->d, pp->port);
762 }
763
764 // assumes b is locked
765 static void
766 cacheflushport(Bridge *b, int port)
767 {
768         Centry *ce;
769         int i;
770
771         ce = b->cache;
772         for(i=0; i<CacheSize; i++,ce++) {
773                 if(ce->port != port)
774                         continue;
775                 memset(ce, 0, sizeof(Centry));
776         }
777 }
778
779 static char *
780 cachedump(Bridge *b)
781 {
782         int i, n;
783         long sec, off;
784         char *buf, *p, *ep;
785         Centry *ce;
786         char c;
787
788         qlock(b);
789         if(waserror()) {
790                 qunlock(b);
791                 nexterror();
792         }
793         sec = TK2SEC(m->ticks);
794         n = 0;
795         for(i=0; i<CacheSize; i++)
796                 if(b->cache[i].expire != 0)
797                         n++;
798         
799         n *= 51;        // change if print format is changed
800         n += 10;        // some slop at the end
801         buf = malloc(n);
802         if(buf == nil)
803                 error(Enomem);
804         p = buf;
805         ep = buf + n;
806         ce = b->cache;
807         off = seconds() - sec;
808         for(i=0; i<CacheSize; i++,ce++) {
809                 if(ce->expire == 0)
810                         continue;       
811                 c = (sec < ce->expire)?'v':'e';
812                 p += snprint(p, ep-p, "%E %2d %10ld %10ld %10ld %c\n", ce->d,
813                         ce->port, ce->src, ce->dst, ce->expire+off, c);
814         }
815         *p = 0;
816         poperror();
817         qunlock(b);
818
819         return buf;
820 }
821
822
823
824 // assumes b is locked, no error return
825 static void
826 ethermultiwrite(Bridge *b, Block *bp, Port *port)
827 {
828         Port *oport;
829         Etherpkt *ep;
830         int i, mcast;
831
832         ep = (Etherpkt*)bp->rp;
833         mcast = ep->d[0] & 1;           /* multicast bit of ethernet address */
834
835         oport = nil;
836         for(i=0; i<b->nport; i++) {
837                 if(i == port->id || b->port[i] == nil)
838                         continue;
839                 /*
840                  * we need to forward multicast packets for ipv6,
841                  * so always do it.
842                  */
843                 if(mcast)
844                         b->port[i]->outmulti++;
845                 else
846                         b->port[i]->outunknown++;
847
848                 // delay one so that the last write does not copy
849                 if(oport != nil) {
850                         b->copy++;
851                         etherwrite(oport, copyblock(bp, BLEN(bp)));
852                 }
853                 oport = b->port[i];
854         }
855
856         // last write free block
857         if(oport)
858                 etherwrite(oport, bp);
859         else
860                 freeb(bp);
861 }
862
863 static void
864 tcpmsshack(Etherpkt *epkt, int n)
865 {
866         int hl, optlen;
867         Iphdr *iphdr;
868         Tcphdr *tcphdr;
869         ulong mss, cksum;
870         uchar *optr;
871
872         /* ignore non-ipv4 packets */
873         if(nhgets(epkt->type) != ETIP4)
874                 return;
875         iphdr = (Iphdr*)(epkt->data);
876         n -= ETHERHDRSIZE;
877         if(n < IPHDR)
878                 return;
879
880         /* ignore bad packets */
881         if(iphdr->vihl != (IP_VER4|IP_HLEN4)) {
882                 hl = (iphdr->vihl&0xF)<<2;
883                 if((iphdr->vihl&0xF0) != IP_VER4 || hl < (IP_HLEN4<<2))
884                         return;
885         } else
886                 hl = IP_HLEN4<<2;
887
888         /* ignore non-tcp packets */
889         if(iphdr->proto != IP_TCPPROTO)
890                 return;
891         n -= hl;
892         if(n < sizeof(Tcphdr))
893                 return;
894         tcphdr = (Tcphdr*)((uchar*)(iphdr) + hl);
895         // MSS can only appear in SYN packet
896         if(!(tcphdr->flag[1] & SYN))
897                 return;
898         hl = (tcphdr->flag[0] & 0xf0)>>2;
899         if(n < hl)
900                 return;
901
902         // check for MSS option
903         optr = (uchar*)tcphdr + sizeof(Tcphdr);
904         n = hl - sizeof(Tcphdr);
905         for(;;) {
906                 if(n <= 0 || *optr == EOLOPT)
907                         return;
908                 if(*optr == NOOPOPT) {
909                         n--;
910                         optr++;
911                         continue;
912                 }
913                 optlen = optr[1];
914                 if(optlen < 2 || optlen > n)
915                         return;
916                 if(*optr == MSSOPT && optlen == MSS_LENGTH)
917                         break;
918                 n -= optlen;
919                 optr += optlen;
920         }
921
922         mss = nhgets(optr+2);
923         if(mss <= TcpMssMax)
924                 return;
925         // fit checksum
926         cksum = nhgets(tcphdr->cksum);
927         if(optr-(uchar*)tcphdr & 1) {
928 print("tcpmsshack: odd alignment!\n");
929                 // odd alignments are a pain
930                 cksum += nhgets(optr+1);
931                 cksum -= (optr[1]<<8)|(TcpMssMax>>8);
932                 cksum += (cksum>>16);
933                 cksum &= 0xffff;
934                 cksum += nhgets(optr+3);
935                 cksum -= ((TcpMssMax&0xff)<<8)|optr[4];
936                 cksum += (cksum>>16);
937         } else {
938                 cksum += mss;
939                 cksum -= TcpMssMax;
940                 cksum += (cksum>>16);
941         }
942         hnputs(tcphdr->cksum, cksum);
943         hnputs(optr+2, TcpMssMax);
944 }
945
946 /*
947  *  process to read from the ethernet
948  */
949 static void
950 etherread(void *a)
951 {
952         Port *port = a;
953         Bridge *b = port->bridge;
954         Block *bp;
955         Etherpkt *ep;
956         Centry *ce;
957         long md, n;
958         
959         qlock(b);
960         port->readp = up;       /* hide identity under a rock for unbind */
961
962         while(!port->closed){
963                 // release lock to read - error means it is time to quit
964                 qunlock(b);
965                 if(waserror()) {
966                         print("etherread read error: %s\n", up->errstr);
967                         qlock(b);
968                         break;
969                 }
970                 bp = devtab[port->data[0]->type]->bread(port->data[0], MaxMTU, 0);
971                 poperror();
972                 qlock(b);
973                 if(bp == nil)
974                         break;
975                 n = BLEN(bp);
976                 if(port->closed || n < ETHERMINTU){
977                         freeb(bp);
978                         continue;
979                 }
980                 if(waserror()) {
981 //                      print("etherread bridge error\n");
982                         freeb(bp);
983                         continue;
984                 }
985                 port->in++;
986
987                 ep = (Etherpkt*)bp->rp;
988                 cacheupdate(b, ep->s, port->id);
989                 if(b->tcpmss)
990                         tcpmsshack(ep, n);
991
992                 /*
993                  * delay packets to simulate a slow link
994                  */
995                 if(b->delay0 != 0 || b->delayn != 0){
996                         md = b->delay0 + b->delayn * n;
997                         if(md > 0)
998                                 microdelay(md);
999                 }
1000
1001                 poperror();     /* must now dispose of bp */
1002
1003                 if(ep->d[0] & 1) {
1004                         log(b, Logmcast, "multicast: port=%d src=%E dst=%E type=%#.4ux\n",
1005                                 port->id, ep->s, ep->d, ep->type[0]<<8|ep->type[1]);
1006                         port->inmulti++;
1007                         ethermultiwrite(b, bp, port);
1008                 } else {
1009                         ce = cachelookup(b, ep->d);
1010                         if(ce == nil) {
1011                                 b->miss++;
1012                                 port->inunknown++;
1013                                 ethermultiwrite(b, bp, port);
1014                         }else if(ce->port != port->id){
1015                                 b->hit++;
1016                                 etherwrite(b->port[ce->port], bp);
1017                         }else
1018                                 freeb(bp);
1019                 }
1020         }
1021 //      print("etherread: trying to exit\n");
1022         port->readp = nil;
1023         portfree(port);
1024         qunlock(b);
1025         pexit("hangup", 1);
1026 }
1027
1028 static int
1029 fragment(Etherpkt *epkt, int n)
1030 {
1031         Iphdr *iphdr;
1032
1033         if(n <= TunnelMtu)
1034                 return 0;
1035
1036         /* ignore non-ipv4 packets */
1037         if(nhgets(epkt->type) != ETIP4)
1038                 return 0;
1039         iphdr = (Iphdr*)(epkt->data);
1040         n -= ETHERHDRSIZE;
1041         /*
1042          * ignore: IP runt packets, bad packets (I don't handle IP
1043          * options for the moment), packets with don't-fragment set,
1044          * and short blocks.
1045          */
1046         if(n < IPHDR || iphdr->vihl != (IP_VER4|IP_HLEN4) ||
1047             iphdr->frag[0] & (IP_DF>>8) || nhgets(iphdr->length) > n)
1048                 return 0;
1049
1050         return 1;
1051 }
1052
1053 static void
1054 etherwrite(Port *port, Block *bp)
1055 {
1056         Iphdr *eh, *feh;
1057         Etherpkt *epkt;
1058         int n, lid, len, seglen, dlen, blklen, mf;
1059         Block *nb;
1060         ushort fragoff, frag;
1061
1062         port->out++;
1063         n = BLEN(bp);
1064         epkt = (Etherpkt*)bp->rp;
1065         if(port->type != Ttun || !fragment(epkt, n)) {
1066                 if(!waserror()){
1067                         devtab[port->data[1]->type]->bwrite(port->data[1], bp, 0);
1068                         poperror();
1069                 }
1070                 return;
1071         }
1072         port->outfrag++;
1073         if(waserror()){
1074                 freeb(bp);      
1075                 return;
1076         }
1077
1078         seglen = (TunnelMtu - ETHERHDRSIZE - IPHDR) & ~7;
1079         eh = (Iphdr*)(epkt->data);
1080         len = nhgets(eh->length);
1081         frag = nhgets(eh->frag);
1082         mf = frag & IP_MF;
1083         frag <<= 3;
1084         dlen = len - IPHDR;
1085         lid = nhgets(eh->id);
1086         bp->rp += ETHERHDRSIZE+IPHDR;
1087         
1088         if(0)
1089                 print("seglen=%d, dlen=%d, mf=%x, frag=%d\n",
1090                         seglen, dlen, mf, frag);
1091         for(fragoff = 0; fragoff < dlen; fragoff += seglen) {
1092                 nb = allocb(ETHERHDRSIZE+IPHDR+seglen);
1093                 
1094                 feh = (Iphdr*)(nb->wp+ETHERHDRSIZE);
1095
1096                 memmove(nb->wp, epkt, ETHERHDRSIZE+IPHDR);
1097                 nb->wp += ETHERHDRSIZE+IPHDR;
1098
1099                 if((fragoff + seglen) >= dlen) {
1100                         seglen = dlen - fragoff;
1101                         hnputs(feh->frag, (frag+fragoff)>>3 | mf);
1102                 }
1103                 else    
1104                         hnputs(feh->frag, (frag+fragoff>>3) | IP_MF);
1105
1106                 hnputs(feh->length, seglen + IPHDR);
1107                 hnputs(feh->id, lid);
1108
1109                 if(seglen){
1110                         blklen = BLEN(bp);
1111                         if(seglen < blklen)
1112                                 blklen = seglen;
1113                         memmove(nb->wp, bp->rp, blklen);
1114                         nb->wp += blklen;
1115                         bp->rp += blklen;
1116                 }
1117
1118                 feh->cksum[0] = 0;
1119                 feh->cksum[1] = 0;
1120                 hnputs(feh->cksum, ipcsum(&feh->vihl));
1121         
1122                 /* don't generate small packets */
1123                 if(BLEN(nb) < ETHERMINTU)
1124                         nb = adjustblock(nb, ETHERMINTU);
1125                 devtab[port->data[1]->type]->bwrite(port->data[1], nb, 0);
1126         }
1127         poperror();
1128         freeb(bp);      
1129 }
1130
1131 // hold b lock
1132 static void
1133 portfree(Port *port)
1134 {
1135         if(decref(port) != 0)
1136                 return;
1137
1138         if(port->data[0])
1139                 cclose(port->data[0]);
1140         if(port->data[1])
1141                 cclose(port->data[1]);
1142         memset(port, 0, sizeof(Port));
1143         free(port);
1144 }
1145
1146 Dev bridgedevtab = {
1147         'B',
1148         "bridge",
1149
1150         devreset,
1151         bridgeinit,
1152         devshutdown,
1153         bridgeattach,
1154         bridgewalk,
1155         bridgestat,
1156         bridgeopen,
1157         devcreate,
1158         bridgeclose,
1159         bridgeread,
1160         devbread,
1161         bridgewrite,
1162         devbwrite,
1163         devremove,
1164         devwstat,
1165 };