10 r = alloc(sizeof(*r));
19 rcmp(const void *a1, const void *a2)
30 return p2->varno - p1->varno;
39 long initpc, val, npc;
52 regbits = RtoB(D_SP) | RtoB(D_AX);
53 for(z=0; z<BITS; z++) {
62 * build aux data structure
64 * find use and set of variables
66 val = 5L * 5L * 5L * 5L * 5L;
76 for(; p != P; p = p->link) {
114 switch(r1->prog->as) {
122 bit = mkvar(r, &p->from);
129 for(z=0; z<BITS; z++)
130 addrs.b[z] |= bit.b[z];
137 for(z=0; z<BITS; z++)
138 r->use1.b[z] |= bit.b[z];
142 bit = mkvar(r, &p->to);
146 diag(Z, "reg: unknown op: %A", p->as);
155 for(z=0; z<BITS; z++)
156 r->use2.b[z] |= bit.b[z];
170 for(z=0; z<BITS; z++)
171 r->set.b[z] |= bit.b[z];
175 * right side read+write
216 for(z=0; z<BITS; z++) {
217 r->set.b[z] |= bit.b[z];
218 r->use2.b[z] |= bit.b[z];
229 for(z=0; z<BITS; z++)
230 addrs.b[z] |= bit.b[z];
237 if(p->to.type != D_NONE)
253 r->regu |= RtoB(D_AX) | RtoB(D_DX);
261 r->regu |= RtoB(D_CX);
270 r->regu |= RtoB(D_SI) | RtoB(D_DI);
279 r->regu |= RtoB(D_AX) | RtoB(D_DI);
288 r->regu |= RtoB(D_DI) | RtoB(D_DX);
293 r->regu |= RtoB(D_AX);
304 * turn branch references to pointers
305 * build back pointers
307 for(r = firstr; r != R; r = r->link) {
309 if(p->to.type == D_BRANCH) {
310 val = p->to.offset - initpc;
314 if(r2 != R && val >= r2->pc) {
324 diag(Z, "ref not found\n%P", p);
329 diag(Z, "ref to self\n%P", p);
339 print("\n%L %D\n", p->lineno, &p->from);
344 * find looping structure
346 for(r = firstr; r != R; r = r->link)
350 if(debug['R'] && debug['v']) {
351 print("\nlooping structure:\n");
352 for(r = firstr; r != R; r = r->link) {
353 print("%ld:%P", r->loop, r->prog);
354 for(z=0; z<BITS; z++)
355 bit.b[z] = r->use1.b[z] |
361 print(" u1=%B", r->use1);
363 print(" u2=%B", r->use2);
365 print(" st=%B", r->set);
373 * iterate propagating usage
374 * back until flow graph is complete
378 for(r = firstr; r != R; r = r->link)
380 for(r = firstr; r != R; r = r->link)
381 if(r->prog->as == ARET)
382 prop(r, zbits, zbits);
384 /* pick up unreachable code */
386 for(r = firstr; r != R; r = r1) {
388 if(r1 && r1->active && !r->active) {
389 prop(r, zbits, zbits);
401 * iterate propagating register/variable synchrony
402 * forward until graph is complete
406 for(r = firstr; r != R; r = r->link)
408 synch(firstr, zbits);
416 * calculate costs (paint1)
420 for(z=0; z<BITS; z++)
421 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
422 ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
424 nearln = r->prog->lineno;
425 warn(Z, "used and not set: %B", bit);
426 if(debug['R'] && !debug['w'])
427 print("used and not set: %B\n", bit);
430 if(debug['R'] && debug['v'])
431 print("\nprop structure:\n");
432 for(r = firstr; r != R; r = r->link)
436 for(r = firstr; r != R; r = r->link) {
437 if(debug['R'] && debug['v']) {
438 print("%P\t", r->prog);
440 print("s:%B ", r->set);
441 if(bany(&r->refahead))
442 print("ra:%B ", r->refahead);
443 if(bany(&r->calahead))
444 print("ca:%B ", r->calahead);
447 for(z=0; z<BITS; z++)
448 bit.b[z] = r->set.b[z] &
449 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
451 nearln = r->prog->lineno;
452 warn(Z, "set and not used: %B", bit);
454 print("set and not used: %B\n", bit);
457 for(z=0; z<BITS; z++)
458 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
464 if(debug['R'] && debug['v'])
467 bit.b[i/32] &= ~(1L<<(i%32));
471 r->prog->lineno, change, blsh(i));
476 if(nregion >= NRGN) {
477 warn(Z, "too many regions");
484 qsort(region, nregion, sizeof(region[0]), rcmp);
488 * determine used registers (paint2)
489 * replace code (paint3)
492 for(i=0; i<nregion; i++) {
493 bit = blsh(rgp->varno);
494 vreg = paint2(rgp->enter, rgp->varno);
495 vreg = allreg(vreg, rgp);
497 print("%L$%d %R: %B\n",
498 rgp->enter->prog->lineno,
504 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
509 * peep-hole on basic block
511 if(!debug['R'] || debug['P'])
519 for(r = firstr; r != R; r = r1) {
526 for(; p != p1; p = p->link) {
548 print("addrs: %B\n", addrs);
551 for(r = firstr; r != R; r = r->link) {
553 if(p->to.type == D_BRANCH)
554 p->to.offset = r->s2->pc;
561 * free aux structures
563 for(p = firstr->prog; p != P; p = p->link){
564 while(p->link && p->link->as == ANOP)
565 p->link = p->link->link;
578 addmove(Reg *r, int bn, int rn, int f)
584 p1 = alloc(sizeof(*p1));
590 p1->lineno = p->lineno;
596 a->offset = v->offset;
601 if(v->etype == TCHAR || v->etype == TUCHAR)
603 if(v->etype == TSHORT || v->etype == TUSHORT)
611 if(v->etype == TUCHAR)
613 if(v->etype == TUSHORT)
617 print("%P\t.a%P\n", p, p1);
628 if(r >= D_AX && r <= D_DI)
631 if(r >= D_AL && r <= D_BL)
632 b |= RtoB(r-D_AL+D_AX);
634 if(r >= D_AH && r <= D_BH)
635 b |= RtoB(r-D_AH+D_AX);
640 mkvar(Reg *r, Adr *a)
649 * mark registers used
652 r->regu |= doregbits(t);
653 r->regu |= doregbits(a->index);
661 for(z=0; z<BITS; z++)
662 addrs.b[z] |= bit.b[z];
675 if(s->name[0] == '.')
680 for(i=0; i<nvar; i++) {
688 if(debug['w'] > 1 && s)
689 warn(Z, "variable not optimized: %s", s->name);
700 print("bit=%2d et=%2d %D\n", i, et, a);
704 if(n == D_EXTERN || n == D_STATIC)
705 for(z=0; z<BITS; z++)
706 externs.b[z] |= bit.b[z];
708 for(z=0; z<BITS; z++)
709 params.b[z] |= bit.b[z];
710 if(v->etype != et || !typechlpfd[et]) /* funny punning */
711 for(z=0; z<BITS; z++)
712 addrs.b[z] |= bit.b[z];
720 prop(Reg *r, Bits ref, Bits cal)
725 for(r1 = r; r1 != R; r1 = r1->p1) {
726 for(z=0; z<BITS; z++) {
727 ref.b[z] |= r1->refahead.b[z];
728 if(ref.b[z] != r1->refahead.b[z]) {
729 r1->refahead.b[z] = ref.b[z];
732 cal.b[z] |= r1->calahead.b[z];
733 if(cal.b[z] != r1->calahead.b[z]) {
734 r1->calahead.b[z] = cal.b[z];
738 switch(r1->prog->as) {
740 for(z=0; z<BITS; z++) {
741 cal.b[z] |= ref.b[z] | externs.b[z];
747 for(z=0; z<BITS; z++) {
754 for(z=0; z<BITS; z++) {
755 cal.b[z] = externs.b[z];
759 for(z=0; z<BITS; z++) {
760 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
761 r1->use1.b[z] | r1->use2.b[z];
762 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
763 r1->refbehind.b[z] = ref.b[z];
764 r1->calbehind.b[z] = cal.b[z];
770 for(; r != r1; r = r->p1)
771 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
772 prop(r2, r->refbehind, r->calbehind);
776 * find looping structure
778 * 1) find reverse postordering
779 * 2) find approximate dominators,
780 * the actual dominators if the flow graph is reducible
781 * otherwise, dominators plus some other non-dominators.
782 * See Matthew S. Hecht and Jeffrey D. Ullman,
783 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
784 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
785 * Oct. 1-3, 1973, pp. 207-217.
786 * 3) find all nodes with a predecessor dominated by the current node.
787 * such a node is a loop head.
788 * recursively, all preds with a greater rpo number are in the loop
791 postorder(Reg *r, Reg **rpo2r, long n)
798 n = postorder(r1, rpo2r, n);
801 n = postorder(r1, rpo2r, n);
808 rpolca(long *idom, long rpo1, long rpo2)
823 fatal(Z, "bad idom");
831 doms(long *idom, long r, long s)
839 loophead(long *idom, Reg *r)
844 if(r->p1 != R && doms(idom, src, r->p1->rpo))
846 for(r = r->p2; r != R; r = r->p2link)
847 if(doms(idom, src, r->rpo))
853 loopmark(Reg **rpo2r, long head, Reg *r)
855 if(r->rpo < head || r->active == head)
860 loopmark(rpo2r, head, r->p1);
861 for(r = r->p2; r != R; r = r->p2link)
862 loopmark(rpo2r, head, r);
866 loopit(Reg *r, long nr)
872 rpo2r = alloc(nr * sizeof(Reg*));
873 idom = alloc(nr * sizeof(long));
877 d = postorder(r, rpo2r, 0);
879 fatal(Z, "too many reg nodes");
881 for(i = 0; i < nr / 2; i++){
883 rpo2r[i] = rpo2r[nr - 1 - i];
884 rpo2r[nr - 1 - i] = r1;
886 for(i = 0; i < nr; i++)
890 for(i = 0; i < nr; i++){
894 if(r1->p1 != R && r1->p1->rpo < me)
896 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
898 d = rpolca(idom, d, r1->rpo);
902 for(i = 0; i < nr; i++){
905 if(r1->p2 != R && loophead(idom, r1))
906 loopmark(rpo2r, i, r1);
911 synch(Reg *r, Bits dif)
916 for(r1 = r; r1 != R; r1 = r1->s1) {
917 for(z=0; z<BITS; z++) {
918 dif.b[z] = (dif.b[z] &
919 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
920 r1->set.b[z] | r1->regdiff.b[z];
921 if(dif.b[z] != r1->regdiff.b[z]) {
922 r1->regdiff.b[z] = dif.b[z];
929 for(z=0; z<BITS; z++)
930 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
937 allreg(ulong b, Rgn *r)
947 diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
961 if(i && r->cost > 0) {
975 paint1(Reg *r, int bn)
987 if(!(r->refbehind.b[z] & bb))
992 if(!(r1->refahead.b[z] & bb))
994 if(r1->act.b[z] & bb)
999 if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1000 change -= CLOAD * r->loop;
1001 if(debug['R'] && debug['v'])
1002 print("%ld%P\tld %B $%d\n", r->loop,
1003 r->prog, blsh(bn), change);
1009 if(r->use1.b[z] & bb) {
1010 change += CREF * r->loop;
1012 if(BtoR(bb) != D_F0)
1014 if(debug['R'] && debug['v'])
1015 print("%ld%P\tu1 %B $%d\n", r->loop,
1016 p, blsh(bn), change);
1019 if((r->use2.b[z]|r->set.b[z]) & bb) {
1020 change += CREF * r->loop;
1022 if(BtoR(bb) != D_F0)
1024 if(debug['R'] && debug['v'])
1025 print("%ld%P\tu2 %B $%d\n", r->loop,
1026 p, blsh(bn), change);
1029 if(STORE(r) & r->regdiff.b[z] & bb) {
1030 change -= CLOAD * r->loop;
1032 if(BtoR(bb) != D_F0)
1034 if(debug['R'] && debug['v'])
1035 print("%ld%P\tst %B $%d\n", r->loop,
1036 p, blsh(bn), change);
1039 if(r->refbehind.b[z] & bb)
1040 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1041 if(r1->refahead.b[z] & bb)
1044 if(!(r->refahead.b[z] & bb))
1048 if(r1->refbehind.b[z] & bb)
1053 if(r->act.b[z] & bb)
1055 if(!(r->refbehind.b[z] & bb))
1061 regset(Reg *r, ulong bb)
1069 while(b = bb & ~(bb-1)) {
1071 c = copyu(r->prog, &v, A);
1080 reguse(Reg *r, ulong bb)
1088 while(b = bb & ~(bb-1)) {
1090 c = copyu(r->prog, &v, A);
1091 if(c == 1 || c == 2 || c == 4)
1099 paint2(Reg *r, int bn)
1108 if(!(r->act.b[z] & bb))
1111 if(!(r->refbehind.b[z] & bb))
1116 if(!(r1->refahead.b[z] & bb))
1118 if(!(r1->act.b[z] & bb))
1127 if(r->refbehind.b[z] & bb)
1128 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1129 if(r1->refahead.b[z] & bb)
1130 vreg |= paint2(r1, bn);
1132 if(!(r->refahead.b[z] & bb))
1136 if(r1->refbehind.b[z] & bb)
1137 vreg |= paint2(r1, bn);
1141 if(!(r->act.b[z] & bb))
1143 if(!(r->refbehind.b[z] & bb))
1151 vreg |= reguse(r, x);
1159 paint3(Reg *r, int bn, long rb, int rn)
1168 if(r->act.b[z] & bb)
1171 if(!(r->refbehind.b[z] & bb))
1176 if(!(r1->refahead.b[z] & bb))
1178 if(r1->act.b[z] & bb)
1183 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1184 addmove(r, bn, rn, 0);
1189 if(r->use1.b[z] & bb) {
1192 addreg(&p->from, rn);
1194 print("\t.c%P\n", p);
1196 if((r->use2.b[z]|r->set.b[z]) & bb) {
1201 print("\t.c%P\n", p);
1204 if(STORE(r) & r->regdiff.b[z] & bb)
1205 addmove(r, bn, rn, 1);
1208 if(r->refbehind.b[z] & bb)
1209 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1210 if(r1->refahead.b[z] & bb)
1211 paint3(r1, bn, rb, rn);
1213 if(!(r->refahead.b[z] & bb))
1217 if(r1->refbehind.b[z] & bb)
1218 paint3(r1, bn, rb, rn);
1222 if(r->act.b[z] & bb)
1224 if(!(r->refbehind.b[z] & bb))
1230 addreg(Adr *a, int rn)
1242 if(r < D_AX || r > D_DI)
1244 return 1L << (r-D_AX);
1254 return bitno(b) + D_AX;