12 r = alloc(sizeof(*r));
21 rcmp(const void *a1, const void *a2)
32 return p2->varno - p1->varno;
41 long initpc, val, npc;
55 for(z=0; z<BITS; z++) {
64 * build aux data structure
66 * find use and set of variables
68 val = 5L * 5L * 5L * 5L * 5L;
78 for(; p != P; p = p->link) {
116 switch(r1->prog->as) {
125 * left side always read
127 bit = mkvar(&p->from, p->as==AMOVW);
128 for(z=0; z<BITS; z++)
129 r->use1.b[z] |= bit.b[z];
132 * right side depends on opcode
134 bit = mkvar(&p->to, 0);
138 diag(Z, "reg: unknown asop: %A", p->as);
152 for(z=0; z<BITS; z++)
153 r->set.b[z] |= bit.b[z];
160 for(z=0; z<BITS; z++)
161 addrs.b[z] |= bit.b[z];
172 * turn branch references to pointers
173 * build back pointers
175 for(r = firstr; r != R; r = r->link) {
177 if(p->to.type == D_BRANCH) {
178 val = p->to.offset - initpc;
182 if(r2 != R && val >= r2->pc) {
192 diag(Z, "ref not found\n%P", p);
197 diag(Z, "ref to self\n%P", p);
207 print("\n%L %D\n", p->lineno, &p->from);
212 * find looping structure
214 for(r = firstr; r != R; r = r->link)
221 * iterate propagating usage
222 * back until flow graph is complete
226 for(r = firstr; r != R; r = r->link)
228 for(r = firstr; r != R; r = r->link)
229 if(r->prog->as == ARET)
230 prop(r, zbits, zbits);
232 /* pick up unreachable code */
234 for(r = firstr; r != R; r = r1) {
236 if(r1 && r1->active && !r->active) {
237 prop(r, zbits, zbits);
249 * iterate propagating register/variable synchrony
250 * forward until graph is complete
254 for(r = firstr; r != R; r = r->link)
256 synch(firstr, zbits);
262 if(debug['R'] && debug['v']) {
263 print("\nprop structure:\n");
264 for(r = firstr; r != R; r = r->link) {
265 print("%ld:%P", r->loop, r->prog);
266 for(z=0; z<BITS; z++)
267 bit.b[z] = r->set.b[z] |
268 r->refahead.b[z] | r->calahead.b[z] |
269 r->refbehind.b[z] | r->calbehind.b[z] |
270 r->use1.b[z] | r->use2.b[z];
274 print(" u1=%B", r->use1);
276 print(" u2=%B", r->use2);
278 print(" st=%B", r->set);
279 if(bany(&r->refahead))
280 print(" ra=%B", r->refahead);
281 if(bany(&r->calahead))
282 print(" ca=%B", r->calahead);
283 if(bany(&r->refbehind))
284 print(" rb=%B", r->refbehind);
285 if(bany(&r->calbehind))
286 print(" cb=%B", r->calbehind);
287 if(bany(&r->regdiff))
288 print(" rd=%B", r->regdiff);
297 * calculate costs (paint1)
301 for(z=0; z<BITS; z++)
302 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
303 ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
305 nearln = r->prog->lineno;
306 warn(Z, "used and not set: %B", bit);
307 if(debug['R'] && !debug['w'])
308 print("used and not set: %B\n", bit);
312 for(r = firstr; r != R; r = r->link)
316 for(r = firstr; r != R; r = r->link) {
317 for(z=0; z<BITS; z++)
318 bit.b[z] = r->set.b[z] &
319 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
321 nearln = r->prog->lineno;
322 warn(Z, "set and not used: %B", bit);
324 print("set and not used: %B\n", bit);
327 for(z=0; z<BITS; z++)
328 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
334 if(debug['R'] && debug['v'])
337 bit.b[i/32] &= ~(1L<<(i%32));
340 print("%L $%d: %B\n",
341 r->prog->lineno, change, blsh(i));
346 if(nregion >= NRGN) {
347 warn(Z, "too many regions");
354 qsort(region, nregion, sizeof(region[0]), rcmp);
358 * determine used registers (paint2)
359 * replace code (paint3)
362 for(i=0; i<nregion; i++) {
363 bit = blsh(rgp->varno);
364 vreg = paint2(rgp->enter, rgp->varno);
365 vreg = allreg(vreg, rgp);
367 if(rgp->regno >= NREG)
368 print("%L $%d F%d: %B\n",
369 rgp->enter->prog->lineno,
374 print("%L $%d R%d: %B\n",
375 rgp->enter->prog->lineno,
381 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
386 * peep-hole on basic block
388 if(!debug['R'] || debug['P'])
396 for(r = firstr; r != R; r = r1) {
403 for(; p != p1; p = p->link) {
425 print("addrs: %B\n", addrs);
428 for(r = firstr; r != R; r = r->link) {
430 if(p->to.type == D_BRANCH)
431 p->to.offset = r->s2->pc;
438 * free aux structures
440 for(p = firstr->prog; p != P; p = p->link){
441 while(p->link && p->link->as == ANOP)
442 p->link = p->link->link;
457 for(r = firstr; r != R; r = r->link) {
460 if(r->prog->as == AJAL)
462 for(r1 = r->p2; r1 != R; r1 = r1->p2link) {
465 for(z=0; z<BITS; z++)
466 bit.b[z] = r1->calbehind.b[z] &
467 (r->refahead.b[z] | r->use1.b[z] | r->use2.b[z]) &
468 ~(r->calahead.b[z] & addrs.b[z]);
471 bit.b[i/32] &= ~(1L << (i%32));
482 addmove(Reg *r, int bn, int rn, int f)
488 p1 = alloc(sizeof(*p1));
494 p1->lineno = p->lineno;
501 a->offset = v->offset;
504 if(a->etype == TARRAY || a->sym == S)
508 if(v->etype == TCHAR || v->etype == TUCHAR)
510 if(v->etype == TSHORT || v->etype == TUSHORT)
512 if(v->etype == TFLOAT)
514 if(v->etype == TDOUBLE)
517 p1->from.type = D_REG;
520 p1->from.type = D_FREG;
521 p1->from.reg = rn-NREG;
532 if(v->etype == TUCHAR)
534 if(v->etype == TUSHORT)
538 print("%P\t.a%P\n", p, p1);
542 mkvar(Adr *a, int docon)
551 if(t == D_REG && a->reg != NREG)
552 regbits |= RtoB(a->reg);
553 if(t == D_FREG && a->reg != NREG)
554 regbits |= FtoB(a->reg);
559 if(t != D_CONST || !docon || a->reg != NREG)
564 if(s == S && sval(o))
570 for(i=0; i<nvar; i++) {
578 if(s->name[0] == '.')
581 if(debug['w'] > 1 && s)
582 warn(Z, "variable not optimized: %s", s->name);
593 print("bit=%2d et=%2d %D\n", i, et, a);
596 if(n == D_EXTERN || n == D_STATIC)
597 for(z=0; z<BITS; z++)
598 externs.b[z] |= bit.b[z];
600 for(z=0; z<BITS; z++)
601 params.b[z] |= bit.b[z];
602 if(v->etype != et || !typechlpfd[et]) /* funny punning */
603 for(z=0; z<BITS; z++)
604 addrs.b[z] |= bit.b[z];
607 for(z=0; z<BITS; z++)
608 consts.b[z] |= bit.b[z];
612 for(z=0; z<BITS; z++)
613 addrs.b[z] |= bit.b[z];
614 for(z=0; z<BITS; z++)
615 params.b[z] |= bit.b[z];
626 prop(Reg *r, Bits ref, Bits cal)
631 for(r1 = r; r1 != R; r1 = r1->p1) {
632 for(z=0; z<BITS; z++) {
633 ref.b[z] |= r1->refahead.b[z];
634 if(ref.b[z] != r1->refahead.b[z]) {
635 r1->refahead.b[z] = ref.b[z];
638 cal.b[z] |= r1->calahead.b[z];
639 if(cal.b[z] != r1->calahead.b[z]) {
640 r1->calahead.b[z] = cal.b[z];
644 switch(r1->prog->as) {
646 for(z=0; z<BITS; z++) {
647 cal.b[z] |= ref.b[z] | externs.b[z];
653 for(z=0; z<BITS; z++) {
660 for(z=0; z<BITS; z++) {
661 cal.b[z] = externs.b[z];
665 for(z=0; z<BITS; z++) {
666 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
667 r1->use1.b[z] | r1->use2.b[z];
668 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
669 r1->refbehind.b[z] = ref.b[z];
670 r1->calbehind.b[z] = cal.b[z];
676 for(; r != r1; r = r->p1)
677 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
678 prop(r2, r->refbehind, r->calbehind);
682 * find looping structure
684 * 1) find reverse postordering
685 * 2) find approximate dominators,
686 * the actual dominators if the flow graph is reducible
687 * otherwise, dominators plus some other non-dominators.
688 * See Matthew S. Hecht and Jeffrey D. Ullman,
689 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
690 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
691 * Oct. 1-3, 1973, pp. 207-217.
692 * 3) find all nodes with a predecessor dominated by the current node.
693 * such a node is a loop head.
694 * recursively, all preds with a greater rpo number are in the loop
697 postorder(Reg *r, Reg **rpo2r, long n)
704 n = postorder(r1, rpo2r, n);
707 n = postorder(r1, rpo2r, n);
714 rpolca(long *idom, long rpo1, long rpo2)
729 fatal(Z, "bad idom");
737 doms(long *idom, long r, long s)
745 loophead(long *idom, Reg *r)
750 if(r->p1 != R && doms(idom, src, r->p1->rpo))
752 for(r = r->p2; r != R; r = r->p2link)
753 if(doms(idom, src, r->rpo))
759 loopmark(Reg **rpo2r, long head, Reg *r)
761 if(r->rpo < head || r->active == head)
766 loopmark(rpo2r, head, r->p1);
767 for(r = r->p2; r != R; r = r->p2link)
768 loopmark(rpo2r, head, r);
772 loopit(Reg *r, long nr)
778 rpo2r = alloc(nr * sizeof(Reg*));
779 idom = alloc(nr * sizeof(long));
783 d = postorder(r, rpo2r, 0);
785 fatal(Z, "too many reg nodes");
787 for(i = 0; i < nr / 2; i++){
789 rpo2r[i] = rpo2r[nr - 1 - i];
790 rpo2r[nr - 1 - i] = r1;
792 for(i = 0; i < nr; i++)
796 for(i = 0; i < nr; i++){
800 if(r1->p1 != R && r1->p1->rpo < me)
802 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
804 d = rpolca(idom, d, r1->rpo);
808 for(i = 0; i < nr; i++){
811 if(r1->p2 != R && loophead(idom, r1))
812 loopmark(rpo2r, i, r1);
817 synch(Reg *r, Bits dif)
822 for(r1 = r; r1 != R; r1 = r1->s1) {
823 for(z=0; z<BITS; z++) {
824 dif.b[z] = (dif.b[z] &
825 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
826 r1->set.b[z] | r1->regdiff.b[z];
827 if(dif.b[z] != r1->regdiff.b[z]) {
828 r1->regdiff.b[z] = dif.b[z];
835 for(z=0; z<BITS; z++)
836 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
843 allreg(ulong b, Rgn *r)
853 diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
867 if(i && r->cost >= 0) {
876 if(i && r->cost >= 0) {
886 paint1(Reg *r, int bn)
898 if(!(r->refbehind.b[z] & bb))
903 if(!(r1->refahead.b[z] & bb))
905 if(r1->act.b[z] & bb)
910 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb) {
911 change -= CLOAD * r->loop;
912 if(debug['R'] && debug['v'])
913 print("%ld%P\tld %B $%d\n", r->loop,
914 r->prog, blsh(bn), change);
920 if(r->use1.b[z] & bb) {
921 change += CREF * r->loop;
922 if(debug['R'] && debug['v'])
923 print("%ld%P\tu1 %B $%d\n", r->loop,
924 p, blsh(bn), change);
927 if((r->use2.b[z]|r->set.b[z]) & bb) {
928 change += CREF * r->loop;
929 if(debug['R'] && debug['v'])
930 print("%ld%P\tu2 %B $%d\n", r->loop,
931 p, blsh(bn), change);
934 if(STORE(r) & r->regdiff.b[z] & bb) {
935 change -= CLOAD * r->loop;
936 if(debug['R'] && debug['v'])
937 print("%ld%P\tst %B $%d\n", r->loop,
938 p, blsh(bn), change);
941 if(r->refbehind.b[z] & bb)
942 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
943 if(r1->refahead.b[z] & bb)
946 if(!(r->refahead.b[z] & bb))
950 if(r1->refbehind.b[z] & bb)
957 if(!(r->refbehind.b[z] & bb))
963 paint2(Reg *r, int bn)
972 if(!(r->act.b[z] & bb))
975 if(!(r->refbehind.b[z] & bb))
980 if(!(r1->refahead.b[z] & bb))
982 if(!(r1->act.b[z] & bb))
991 if(r->refbehind.b[z] & bb)
992 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
993 if(r1->refahead.b[z] & bb)
994 vreg |= paint2(r1, bn);
996 if(!(r->refahead.b[z] & bb))
1000 if(r1->refbehind.b[z] & bb)
1001 vreg |= paint2(r1, bn);
1005 if(!(r->act.b[z] & bb))
1007 if(!(r->refbehind.b[z] & bb))
1014 paint3(Reg *r, int bn, long rb, int rn)
1023 if(r->act.b[z] & bb)
1026 if(!(r->refbehind.b[z] & bb))
1031 if(!(r1->refahead.b[z] & bb))
1033 if(r1->act.b[z] & bb)
1038 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1039 addmove(r, bn, rn, 0);
1044 if(r->use1.b[z] & bb) {
1047 addreg(&p->from, rn);
1049 print("\t.c%P\n", p);
1051 if((r->use2.b[z]|r->set.b[z]) & bb) {
1056 print("\t.c%P\n", p);
1059 if(STORE(r) & r->regdiff.b[z] & bb)
1060 addmove(r, bn, rn, 1);
1063 if(r->refbehind.b[z] & bb)
1064 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1065 if(r1->refahead.b[z] & bb)
1066 paint3(r1, bn, rb, rn);
1068 if(!(r->refahead.b[z] & bb))
1072 if(r1->refbehind.b[z] & bb)
1073 paint3(r1, bn, rb, rn);
1077 if(r->act.b[z] & bb)
1079 if(!(r->refbehind.b[z] & bb))
1085 addreg(Adr *a, int rn)
1121 return bitno(b) + 3;
1135 if(f < 4 || f > 22 || (f&1))
1137 return 1L << (f/2 + 20);
1147 return bitno(b)*2 - 40;