10 r = alloc(sizeof(*r));
19 rcmp(const void *a1, const void *a2)
30 return p2->varno - p1->varno;
39 long initpc, val, npc;
52 regbits = RtoB(D_SP) | RtoB(D_AX);
53 for(z=0; z<BITS; z++) {
62 * build aux data structure
64 * find use and set of variables
66 val = 5L * 5L * 5L * 5L * 5L;
76 for(; p != P; p = p->link) {
114 switch(r1->prog->as) {
122 bit = mkvar(r, &p->from, p->as==AMOVL);
129 for(z=0; z<BITS; z++)
130 addrs.b[z] |= bit.b[z];
137 for(z=0; z<BITS; z++)
138 r->use1.b[z] |= bit.b[z];
142 bit = mkvar(r, &p->to, 0);
146 diag(Z, "reg: unknown op: %A", p->as);
155 for(z=0; z<BITS; z++)
156 r->use2.b[z] |= bit.b[z];
170 for(z=0; z<BITS; z++)
171 r->set.b[z] |= bit.b[z];
175 * right side read+write
216 for(z=0; z<BITS; z++) {
217 r->set.b[z] |= bit.b[z];
218 r->use2.b[z] |= bit.b[z];
231 for(z=0; z<BITS; z++)
232 addrs.b[z] |= bit.b[z];
239 if(p->to.type != D_NONE)
255 r->regu |= RtoB(D_AX) | RtoB(D_DX);
263 r->regu |= RtoB(D_CX);
272 r->regu |= RtoB(D_SI) | RtoB(D_DI);
281 r->regu |= RtoB(D_AX) | RtoB(D_DI);
290 r->regu |= RtoB(D_DI) | RtoB(D_DX);
295 r->regu |= RtoB(D_AX);
306 * turn branch references to pointers
307 * build back pointers
309 for(r = firstr; r != R; r = r->link) {
311 if(p->to.type == D_BRANCH) {
312 val = p->to.offset - initpc;
316 if(r2 != R && val >= r2->pc) {
326 diag(Z, "ref not found\n%P", p);
331 diag(Z, "ref to self\n%P", p);
341 print("\n%L %D\n", p->lineno, &p->from);
346 * find looping structure
348 for(r = firstr; r != R; r = r->link)
352 if(debug['R'] && debug['v']) {
353 print("\nlooping structure:\n");
354 for(r = firstr; r != R; r = r->link) {
355 print("%ld:%P", r->loop, r->prog);
356 for(z=0; z<BITS; z++)
357 bit.b[z] = r->use1.b[z] |
363 print(" u1=%B", r->use1);
365 print(" u2=%B", r->use2);
367 print(" st=%B", r->set);
375 * iterate propagating usage
376 * back until flow graph is complete
380 for(r = firstr; r != R; r = r->link)
382 for(r = firstr; r != R; r = r->link)
383 if(r->prog->as == ARET)
384 prop(r, zbits, zbits);
386 /* pick up unreachable code */
388 for(r = firstr; r != R; r = r1) {
390 if(r1 && r1->active && !r->active) {
391 prop(r, zbits, zbits);
403 * iterate propagating register/variable synchrony
404 * forward until graph is complete
408 for(r = firstr; r != R; r = r->link)
410 synch(firstr, zbits);
418 * calculate costs (paint1)
422 for(z=0; z<BITS; z++)
423 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
424 ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
426 nearln = r->prog->lineno;
427 warn(Z, "used and not set: %B", bit);
428 if(debug['R'] && !debug['w'])
429 print("used and not set: %B\n", bit);
432 if(debug['R'] && debug['v'])
433 print("\nprop structure:\n");
434 for(r = firstr; r != R; r = r->link)
438 for(r = firstr; r != R; r = r->link) {
439 if(debug['R'] && debug['v']) {
440 print("%P\t", r->prog);
442 print("s:%B ", r->set);
443 if(bany(&r->refahead))
444 print("ra:%B ", r->refahead);
445 if(bany(&r->calahead))
446 print("ca:%B ", r->calahead);
449 for(z=0; z<BITS; z++)
450 bit.b[z] = r->set.b[z] &
451 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
453 nearln = r->prog->lineno;
454 warn(Z, "set and not used: %B", bit);
456 print("set and not used: %B\n", bit);
459 for(z=0; z<BITS; z++)
460 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
466 if(debug['R'] && debug['v'])
469 bit.b[i/32] &= ~(1L<<(i%32));
473 r->prog->lineno, change, blsh(i));
478 if(nregion >= NRGN) {
479 warn(Z, "too many regions");
486 qsort(region, nregion, sizeof(region[0]), rcmp);
490 * determine used registers (paint2)
491 * replace code (paint3)
494 for(i=0; i<nregion; i++) {
495 bit = blsh(rgp->varno);
496 vreg = paint2(rgp->enter, rgp->varno);
497 vreg = allreg(vreg, rgp);
499 print("%L$%d %R: %B\n",
500 rgp->enter->prog->lineno,
506 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
511 * peep-hole on basic block
513 if(!debug['R'] || debug['P'])
521 for(r = firstr; r != R; r = r1) {
528 for(; p != p1; p = p->link) {
550 print("addrs: %B\n", addrs);
553 for(r = firstr; r != R; r = r->link) {
555 if(p->to.type == D_BRANCH)
556 p->to.offset = r->s2->pc;
563 * free aux structures
565 for(p = firstr->prog; p != P; p = p->link){
566 while(p->link && p->link->as == ANOP)
567 p->link = p->link->link;
580 addmove(Reg *r, int bn, int rn, int f)
586 p1 = alloc(sizeof(*p1));
592 p1->lineno = p->lineno;
598 a->offset = v->offset;
603 if(v->etype == TCHAR || v->etype == TUCHAR)
605 if(v->etype == TSHORT || v->etype == TUSHORT)
613 if(v->etype == TUCHAR)
615 if(v->etype == TUSHORT)
619 print("%P\t.a%P\n", p, p1);
630 if(r >= D_AX && r <= D_DI)
633 if(r >= D_AL && r <= D_BL)
634 b |= RtoB(r-D_AL+D_AX);
636 if(r >= D_AH && r <= D_BH)
637 b |= RtoB(r-D_AH+D_AX);
642 mkvar(Reg *r, Adr *a, int isro)
651 * mark registers used
654 r->regu |= doregbits(t);
655 r->regu |= doregbits(a->index);
665 {static Sym er; a->sym = &er;}
666 a->sym->name = "$extreg";
670 bit = mkvar(r, a, 0);
671 for(z=0; z<BITS; z++)
672 addrs.b[z] |= bit.b[z];
685 if(s->name[0] == '.')
689 for(i=0; i<nvar; i++) {
697 if(debug['w'] > 1 && s)
698 warn(Z, "variable not optimized: %s", s->name);
709 print("bit=%2d et=%2d %D\n", i, et, a);
713 if(n == D_EXTERN || n == D_STATIC)
714 for(z=0; z<BITS; z++)
715 externs.b[z] |= bit.b[z];
717 for(z=0; z<BITS; z++)
718 params.b[z] |= bit.b[z];
719 if(v->etype != et || !typechlpfd[et]) /* funny punning */
720 for(z=0; z<BITS; z++)
721 addrs.b[z] |= bit.b[z];
729 prop(Reg *r, Bits ref, Bits cal)
734 for(r1 = r; r1 != R; r1 = r1->p1) {
735 for(z=0; z<BITS; z++) {
736 ref.b[z] |= r1->refahead.b[z];
737 if(ref.b[z] != r1->refahead.b[z]) {
738 r1->refahead.b[z] = ref.b[z];
741 cal.b[z] |= r1->calahead.b[z];
742 if(cal.b[z] != r1->calahead.b[z]) {
743 r1->calahead.b[z] = cal.b[z];
747 switch(r1->prog->as) {
749 for(z=0; z<BITS; z++) {
750 cal.b[z] |= ref.b[z] | externs.b[z];
756 for(z=0; z<BITS; z++) {
763 for(z=0; z<BITS; z++) {
764 cal.b[z] = externs.b[z];
768 for(z=0; z<BITS; z++) {
769 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
770 r1->use1.b[z] | r1->use2.b[z];
771 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
772 r1->refbehind.b[z] = ref.b[z];
773 r1->calbehind.b[z] = cal.b[z];
779 for(; r != r1; r = r->p1)
780 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
781 prop(r2, r->refbehind, r->calbehind);
785 * find looping structure
787 * 1) find reverse postordering
788 * 2) find approximate dominators,
789 * the actual dominators if the flow graph is reducible
790 * otherwise, dominators plus some other non-dominators.
791 * See Matthew S. Hecht and Jeffrey D. Ullman,
792 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
793 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
794 * Oct. 1-3, 1973, pp. 207-217.
795 * 3) find all nodes with a predecessor dominated by the current node.
796 * such a node is a loop head.
797 * recursively, all preds with a greater rpo number are in the loop
800 postorder(Reg *r, Reg **rpo2r, long n)
807 n = postorder(r1, rpo2r, n);
810 n = postorder(r1, rpo2r, n);
817 rpolca(long *idom, long rpo1, long rpo2)
832 fatal(Z, "bad idom");
840 doms(long *idom, long r, long s)
848 loophead(long *idom, Reg *r)
853 if(r->p1 != R && doms(idom, src, r->p1->rpo))
855 for(r = r->p2; r != R; r = r->p2link)
856 if(doms(idom, src, r->rpo))
862 loopmark(Reg **rpo2r, long head, Reg *r)
864 if(r->rpo < head || r->active == head)
869 loopmark(rpo2r, head, r->p1);
870 for(r = r->p2; r != R; r = r->p2link)
871 loopmark(rpo2r, head, r);
875 loopit(Reg *r, long nr)
881 rpo2r = alloc(nr * sizeof(Reg*));
882 idom = alloc(nr * sizeof(long));
886 d = postorder(r, rpo2r, 0);
888 fatal(Z, "too many reg nodes");
890 for(i = 0; i < nr / 2; i++){
892 rpo2r[i] = rpo2r[nr - 1 - i];
893 rpo2r[nr - 1 - i] = r1;
895 for(i = 0; i < nr; i++)
899 for(i = 0; i < nr; i++){
903 if(r1->p1 != R && r1->p1->rpo < me)
905 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
907 d = rpolca(idom, d, r1->rpo);
911 for(i = 0; i < nr; i++){
914 if(r1->p2 != R && loophead(idom, r1))
915 loopmark(rpo2r, i, r1);
920 synch(Reg *r, Bits dif)
925 for(r1 = r; r1 != R; r1 = r1->s1) {
926 for(z=0; z<BITS; z++) {
927 dif.b[z] = (dif.b[z] &
928 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
929 r1->set.b[z] | r1->regdiff.b[z];
930 if(dif.b[z] != r1->regdiff.b[z]) {
931 r1->regdiff.b[z] = dif.b[z];
938 for(z=0; z<BITS; z++)
939 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
946 allreg(ulong b, Rgn *r)
956 diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
970 if(i && r->cost > 0) {
984 paint1(Reg *r, int bn)
996 if(!(r->refbehind.b[z] & bb))
1001 if(!(r1->refahead.b[z] & bb))
1003 if(r1->act.b[z] & bb)
1008 if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1009 change -= CLOAD * r->loop;
1010 if(debug['R'] && debug['v'])
1011 print("%ld%P\tld %B $%d\n", r->loop,
1012 r->prog, blsh(bn), change);
1018 if(r->use1.b[z] & bb) {
1019 change += CREF * r->loop;
1020 if(p->as == AFMOVL || p->as == AFMOVW)
1021 if(BtoR(bb) != D_F0)
1023 if(debug['R'] && debug['v'])
1024 print("%ld%P\tu1 %B $%d\n", r->loop,
1025 p, blsh(bn), change);
1028 if((r->use2.b[z]|r->set.b[z]) & bb) {
1029 change += CREF * r->loop;
1030 if(p->as == AFMOVL || p->as == AFMOVW)
1031 if(BtoR(bb) != D_F0)
1033 if(debug['R'] && debug['v'])
1034 print("%ld%P\tu2 %B $%d\n", r->loop,
1035 p, blsh(bn), change);
1038 if(STORE(r) & r->regdiff.b[z] & bb) {
1039 change -= CLOAD * r->loop;
1040 if(p->as == AFMOVL || p->as == AFMOVW)
1041 if(BtoR(bb) != D_F0)
1043 if(debug['R'] && debug['v'])
1044 print("%ld%P\tst %B $%d\n", r->loop,
1045 p, blsh(bn), change);
1048 if(r->refbehind.b[z] & bb)
1049 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1050 if(r1->refahead.b[z] & bb)
1053 if(!(r->refahead.b[z] & bb))
1057 if(r1->refbehind.b[z] & bb)
1062 if(r->act.b[z] & bb)
1064 if(!(r->refbehind.b[z] & bb))
1070 regset(Reg *r, ulong bb)
1078 while(b = bb & ~(bb-1)) {
1080 c = copyu(r->prog, &v, A);
1089 reguse(Reg *r, ulong bb)
1097 while(b = bb & ~(bb-1)) {
1099 c = copyu(r->prog, &v, A);
1100 if(c == 1 || c == 2 || c == 4)
1108 paint2(Reg *r, int bn)
1117 if(!(r->act.b[z] & bb))
1120 if(!(r->refbehind.b[z] & bb))
1125 if(!(r1->refahead.b[z] & bb))
1127 if(!(r1->act.b[z] & bb))
1136 if(r->refbehind.b[z] & bb)
1137 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1138 if(r1->refahead.b[z] & bb)
1139 vreg |= paint2(r1, bn);
1141 if(!(r->refahead.b[z] & bb))
1145 if(r1->refbehind.b[z] & bb)
1146 vreg |= paint2(r1, bn);
1150 if(!(r->act.b[z] & bb))
1152 if(!(r->refbehind.b[z] & bb))
1160 vreg |= reguse(r, x);
1168 paint3(Reg *r, int bn, long rb, int rn)
1177 if(r->act.b[z] & bb)
1180 if(!(r->refbehind.b[z] & bb))
1185 if(!(r1->refahead.b[z] & bb))
1187 if(r1->act.b[z] & bb)
1192 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1193 addmove(r, bn, rn, 0);
1198 if(r->use1.b[z] & bb) {
1201 addreg(&p->from, rn);
1203 print("\t.c%P\n", p);
1205 if((r->use2.b[z]|r->set.b[z]) & bb) {
1210 print("\t.c%P\n", p);
1213 if(STORE(r) & r->regdiff.b[z] & bb)
1214 addmove(r, bn, rn, 1);
1217 if(r->refbehind.b[z] & bb)
1218 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1219 if(r1->refahead.b[z] & bb)
1220 paint3(r1, bn, rb, rn);
1222 if(!(r->refahead.b[z] & bb))
1226 if(r1->refbehind.b[z] & bb)
1227 paint3(r1, bn, rb, rn);
1231 if(r->act.b[z] & bb)
1233 if(!(r->refbehind.b[z] & bb))
1239 addreg(Adr *a, int rn)
1251 if(r < D_AX || r > D_DI)
1253 return 1L << (r-D_AX);
1263 return bitno(b) + D_AX;