12 r = alloc(sizeof(*r));
21 rcmp(const void *a1, const void *a2)
32 return p2->varno - p1->varno;
41 long initpc, val, npc;
55 for(z=0; z<BITS; z++) {
64 * build aux data structure
66 * find use and set of variables
68 val = 5L * 5L * 5L * 5L * 5L;
78 for(; p != P; p = p->link) {
116 switch(r1->prog->as) {
125 * left side always read
127 bit = mkvar(&p->from, p->as==AMOVW);
128 for(z=0; z<BITS; z++)
129 r->use1.b[z] |= bit.b[z];
132 * right side depends on opcode
134 bit = mkvar(&p->to, 0);
138 diag(Z, "reg: unknown asop: %A", p->as);
152 for(z=0; z<BITS; z++)
153 r->set.b[z] |= bit.b[z];
160 for(z=0; z<BITS; z++)
161 addrs.b[z] |= bit.b[z];
166 if(p->from.type == D_CONST)
184 * turn branch references to pointers
185 * build back pointers
187 for(r = firstr; r != R; r = r->link) {
189 if(p->to.type == D_BRANCH) {
190 val = p->to.offset - initpc;
194 if(r2 != R && val >= r2->pc) {
204 diag(Z, "ref not found\n%P", p);
209 diag(Z, "ref to self\n%P", p);
219 print("\n%L %D\n", p->lineno, &p->from);
224 * find looping structure
226 for(r = firstr; r != R; r = r->link)
233 * iterate propagating usage
234 * back until flow graph is complete
238 for(r = firstr; r != R; r = r->link)
240 for(r = firstr; r != R; r = r->link)
241 if(r->prog->as == ARET)
242 prop(r, zbits, zbits);
244 /* pick up unreachable code */
246 for(r = firstr; r != R; r = r1) {
248 if(r1 && r1->active && !r->active) {
249 prop(r, zbits, zbits);
261 * iterate propagating register/variable synchrony
262 * forward until graph is complete
266 for(r = firstr; r != R; r = r->link)
268 synch(firstr, zbits);
274 if(debug['R'] && debug['v']) {
275 print("\nprop structure:\n");
276 for(r = firstr; r != R; r = r->link) {
277 print("%ld:%P", r->loop, r->prog);
278 for(z=0; z<BITS; z++)
279 bit.b[z] = r->set.b[z] |
280 r->refahead.b[z] | r->calahead.b[z] |
281 r->refbehind.b[z] | r->calbehind.b[z] |
282 r->use1.b[z] | r->use2.b[z];
286 print(" u1=%B", r->use1);
288 print(" u2=%B", r->use2);
290 print(" st=%B", r->set);
291 if(bany(&r->refahead))
292 print(" ra=%B", r->refahead);
293 if(bany(&r->calahead))
294 print(" ca=%B", r->calahead);
295 if(bany(&r->refbehind))
296 print(" rb=%B", r->refbehind);
297 if(bany(&r->calbehind))
298 print(" cb=%B", r->calbehind);
307 * calculate costs (paint1)
311 for(z=0; z<BITS; z++)
312 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
313 ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
315 nearln = r->prog->lineno;
316 warn(Z, "used and not set: %B", bit);
317 if(debug['R'] && !debug['w'])
318 print("used and not set: %B\n", bit);
322 for(r = firstr; r != R; r = r->link)
326 for(r = firstr; r != R; r = r->link) {
327 for(z=0; z<BITS; z++)
328 bit.b[z] = r->set.b[z] &
329 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
331 nearln = r->prog->lineno;
332 warn(Z, "set and not used: %B", bit);
334 print("set and not used: %B\n", bit);
337 for(z=0; z<BITS; z++)
338 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
344 if(debug['R'] && debug['v'])
347 bit.b[i/32] &= ~(1L<<(i%32));
350 print("%L $%d: %B\n",
351 r->prog->lineno, change, blsh(i));
356 if(nregion >= NRGN) {
357 warn(Z, "too many regions");
364 qsort(region, nregion, sizeof(region[0]), rcmp);
368 * determine used registers (paint2)
369 * replace code (paint3)
372 for(i=0; i<nregion; i++) {
373 bit = blsh(rgp->varno);
374 vreg = paint2(rgp->enter, rgp->varno);
375 vreg = allreg(vreg, rgp);
377 if(rgp->regno >= NREG)
378 print("%L $%d F%d: %B\n",
379 rgp->enter->prog->lineno,
384 print("%L $%d R%d: %B\n",
385 rgp->enter->prog->lineno,
391 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
396 * peep-hole on basic block
398 if(!debug['R'] || debug['P'])
406 for(r = firstr; r != R; r = r1) {
413 for(; p != p1; p = p->link) {
435 print("addrs: %B\n", addrs);
438 for(r = firstr; r != R; r = r->link) {
440 if(p->to.type == D_BRANCH)
441 p->to.offset = r->s2->pc;
448 * free aux structures
450 for(p = firstr->prog; p != P; p = p->link){
451 while(p->link && p->link->as == ANOP)
452 p->link = p->link->link;
467 for(r = firstr; r != R; r = r->link) {
470 if(r->prog->as == ABL)
472 for(r1 = r->p2; r1 != R; r1 = r1->p2link) {
475 for(z=0; z<BITS; z++)
476 bit.b[z] = r1->calbehind.b[z] &
477 (r->refahead.b[z] | r->use1.b[z] | r->use2.b[z]) &
478 ~(r->calahead.b[z] & addrs.b[z]);
481 bit.b[i/32] &= ~(1L << (i%32));
492 addmove(Reg *r, int bn, int rn, int f)
498 p1 = alloc(sizeof(*p1));
504 p1->lineno = p->lineno;
511 a->offset = v->offset;
514 if(a->etype == TARRAY || a->sym == S)
518 if(v->etype == TCHAR || v->etype == TUCHAR)
520 if(v->etype == TSHORT || v->etype == TUSHORT)
522 if(v->etype == TFLOAT)
524 if(v->etype == TDOUBLE)
527 p1->from.type = D_REG;
530 p1->from.type = D_FREG;
531 p1->from.reg = rn-NREG;
542 if(v->etype == TUCHAR)
544 if(v->etype == TUSHORT)
548 print("%P\t.a%P\n", p, p1);
552 mkvar(Adr *a, int docon)
561 if(t == D_REG && a->reg != NREG)
562 regbits |= RtoB(a->reg);
563 if(t == D_FREG && a->reg != NREG)
564 regbits |= FtoB(a->reg);
569 if(t != D_CONST || !docon || a->reg != NREG)
574 if(s == S && sval(o))
580 for(i=0; i<nvar; i++) {
588 if(s->name[0] == '.')
591 if(debug['w'] > 1 && s)
592 warn(Z, "variable not optimized: %s", s->name);
603 print("bit=%2d et=%2d %D\n", i, et, a);
606 if(n == D_EXTERN || n == D_STATIC)
607 for(z=0; z<BITS; z++)
608 externs.b[z] |= bit.b[z];
610 for(z=0; z<BITS; z++)
611 params.b[z] |= bit.b[z];
612 if(v->etype != et || !typechlpfd[et]) /* funny punning */
613 for(z=0; z<BITS; z++)
614 addrs.b[z] |= bit.b[z];
617 for(z=0; z<BITS; z++)
618 consts.b[z] |= bit.b[z];
622 for(z=0; z<BITS; z++)
623 addrs.b[z] |= bit.b[z];
624 for(z=0; z<BITS; z++)
625 params.b[z] |= bit.b[z];
636 prop(Reg *r, Bits ref, Bits cal)
641 for(r1 = r; r1 != R; r1 = r1->p1) {
642 for(z=0; z<BITS; z++) {
643 ref.b[z] |= r1->refahead.b[z];
644 if(ref.b[z] != r1->refahead.b[z]) {
645 r1->refahead.b[z] = ref.b[z];
648 cal.b[z] |= r1->calahead.b[z];
649 if(cal.b[z] != r1->calahead.b[z]) {
650 r1->calahead.b[z] = cal.b[z];
654 switch(r1->prog->as) {
656 for(z=0; z<BITS; z++) {
657 cal.b[z] |= ref.b[z] | externs.b[z];
663 for(z=0; z<BITS; z++) {
670 for(z=0; z<BITS; z++) {
671 cal.b[z] = externs.b[z];
675 for(z=0; z<BITS; z++) {
676 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
677 r1->use1.b[z] | r1->use2.b[z];
678 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
679 r1->refbehind.b[z] = ref.b[z];
680 r1->calbehind.b[z] = cal.b[z];
686 for(; r != r1; r = r->p1)
687 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
688 prop(r2, r->refbehind, r->calbehind);
692 * find looping structure
694 * 1) find reverse postordering
695 * 2) find approximate dominators,
696 * the actual dominators if the flow graph is reducible
697 * otherwise, dominators plus some other non-dominators.
698 * See Matthew S. Hecht and Jeffrey D. Ullman,
699 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
700 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
701 * Oct. 1-3, 1973, pp. 207-217.
702 * 3) find all nodes with a predecessor dominated by the current node.
703 * such a node is a loop head.
704 * recursively, all preds with a greater rpo number are in the loop
707 postorder(Reg *r, Reg **rpo2r, long n)
714 n = postorder(r1, rpo2r, n);
717 n = postorder(r1, rpo2r, n);
724 rpolca(long *idom, long rpo1, long rpo2)
739 fatal(Z, "bad idom");
747 doms(long *idom, long r, long s)
755 loophead(long *idom, Reg *r)
760 if(r->p1 != R && doms(idom, src, r->p1->rpo))
762 for(r = r->p2; r != R; r = r->p2link)
763 if(doms(idom, src, r->rpo))
769 loopmark(Reg **rpo2r, long head, Reg *r)
771 if(r->rpo < head || r->active == head)
776 loopmark(rpo2r, head, r->p1);
777 for(r = r->p2; r != R; r = r->p2link)
778 loopmark(rpo2r, head, r);
782 loopit(Reg *r, long nr)
788 rpo2r = alloc(nr * sizeof(Reg*));
789 idom = alloc(nr * sizeof(long));
793 d = postorder(r, rpo2r, 0);
795 fatal(Z, "too many reg nodes");
797 for(i = 0; i < nr / 2; i++){
799 rpo2r[i] = rpo2r[nr - 1 - i];
800 rpo2r[nr - 1 - i] = r1;
802 for(i = 0; i < nr; i++)
806 for(i = 0; i < nr; i++){
810 if(r1->p1 != R && r1->p1->rpo < me)
812 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
814 d = rpolca(idom, d, r1->rpo);
818 for(i = 0; i < nr; i++){
821 if(r1->p2 != R && loophead(idom, r1))
822 loopmark(rpo2r, i, r1);
827 synch(Reg *r, Bits dif)
832 for(r1 = r; r1 != R; r1 = r1->s1) {
833 for(z=0; z<BITS; z++) {
834 dif.b[z] = (dif.b[z] &
835 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
836 r1->set.b[z] | r1->regdiff.b[z];
837 if(dif.b[z] != r1->regdiff.b[z]) {
838 r1->regdiff.b[z] = dif.b[z];
845 for(z=0; z<BITS; z++)
846 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
853 allreg(ulong b, Rgn *r)
863 diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
877 if(i && r->cost >= 0) {
887 if(i && r->cost >= 0) {
897 paint1(Reg *r, int bn)
909 if(!(r->refbehind.b[z] & bb))
914 if(!(r1->refahead.b[z] & bb))
916 if(r1->act.b[z] & bb)
921 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb) {
922 change -= CLOAD * r->loop;
923 if(debug['R'] && debug['v'])
924 print("%ld%P\tld %B $%d\n", r->loop,
925 r->prog, blsh(bn), change);
931 if(r->use1.b[z] & bb) {
932 change += CREF * r->loop;
933 if(debug['R'] && debug['v'])
934 print("%ld%P\tu1 %B $%d\n", r->loop,
935 p, blsh(bn), change);
938 if((r->use2.b[z]|r->set.b[z]) & bb) {
939 change += CREF * r->loop;
940 if(debug['R'] && debug['v'])
941 print("%ld%P\tu2 %B $%d\n", r->loop,
942 p, blsh(bn), change);
945 if(STORE(r) & r->regdiff.b[z] & bb) {
946 change -= CLOAD * r->loop;
947 if(debug['R'] && debug['v'])
948 print("%ld%P\tst %B $%d\n", r->loop,
949 p, blsh(bn), change);
952 if(r->refbehind.b[z] & bb)
953 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
954 if(r1->refahead.b[z] & bb)
957 if(!(r->refahead.b[z] & bb))
961 if(r1->refbehind.b[z] & bb)
968 if(!(r->refbehind.b[z] & bb))
974 paint2(Reg *r, int bn)
983 if(!(r->act.b[z] & bb))
986 if(!(r->refbehind.b[z] & bb))
991 if(!(r1->refahead.b[z] & bb))
993 if(!(r1->act.b[z] & bb))
1002 if(r->refbehind.b[z] & bb)
1003 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1004 if(r1->refahead.b[z] & bb)
1005 vreg |= paint2(r1, bn);
1007 if(!(r->refahead.b[z] & bb))
1011 if(r1->refbehind.b[z] & bb)
1012 vreg |= paint2(r1, bn);
1016 if(!(r->act.b[z] & bb))
1018 if(!(r->refbehind.b[z] & bb))
1025 paint3(Reg *r, int bn, long rb, int rn)
1034 if(r->act.b[z] & bb)
1037 if(!(r->refbehind.b[z] & bb))
1042 if(!(r1->refahead.b[z] & bb))
1044 if(r1->act.b[z] & bb)
1049 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1050 addmove(r, bn, rn, 0);
1055 if(r->use1.b[z] & bb) {
1058 addreg(&p->from, rn);
1060 print("\t.c%P\n", p);
1062 if((r->use2.b[z]|r->set.b[z]) & bb) {
1067 print("\t.c%P\n", p);
1070 if(STORE(r) & r->regdiff.b[z] & bb)
1071 addmove(r, bn, rn, 1);
1074 if(r->refbehind.b[z] & bb)
1075 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1076 if(r1->refahead.b[z] & bb)
1077 paint3(r1, bn, rb, rn);
1079 if(!(r->refahead.b[z] & bb))
1083 if(r1->refbehind.b[z] & bb)
1084 paint3(r1, bn, rb, rn);
1088 if(r->act.b[z] & bb)
1090 if(!(r->refbehind.b[z] & bb))
1096 addreg(Adr *a, int rn)
1120 if(r >= REGMIN && r <= REGMAX)
1145 if(f < 2 || f > NFREG-1)
1147 return 1L << (f + 16);
1157 return bitno(b) - 16;