10 r = alloc(sizeof(*r));
19 rcmp(const void *a1, const void *a2)
30 return p2->varno - p1->varno;
39 long initpc, val, npc;
52 regbits = RtoB(D_SP) | RtoB(D_AX) | RtoB(D_X0);
54 regbits |= RtoB(REGEXT) | RtoB(REGEXT-1);
55 for(z=0; z<BITS; z++) {
64 * build aux data structure
66 * find use and set of variables
68 val = 5L * 5L * 5L * 5L * 5L;
78 for(; p != P; p = p->link) {
116 switch(r1->prog->as) {
125 bit = mkvar(r, &p->from);
133 for(z=0; z<BITS; z++)
134 addrs.b[z] |= bit.b[z];
141 for(z=0; z<BITS; z++)
142 r->use1.b[z] |= bit.b[z];
146 bit = mkvar(r, &p->to);
150 diag(Z, "reg: unknown op: %A", p->as);
164 for(z=0; z<BITS; z++)
165 r->use2.b[z] |= bit.b[z];
204 for(z=0; z<BITS; z++)
205 r->set.b[z] |= bit.b[z];
209 * right side read+write
286 for(z=0; z<BITS; z++) {
287 r->set.b[z] |= bit.b[z];
288 r->use2.b[z] |= bit.b[z];
296 for(z=0; z<BITS; z++)
297 addrs.b[z] |= bit.b[z];
305 if(p->to.type != D_NONE)
325 r->regu |= RtoB(D_AX) | RtoB(D_DX);
333 r->regu |= RtoB(D_CX);
344 r->regu |= RtoB(D_SI) | RtoB(D_DI);
355 r->regu |= RtoB(D_AX) | RtoB(D_DI);
364 r->regu |= RtoB(D_DI) | RtoB(D_DX);
375 * turn branch references to pointers
376 * build back pointers
378 for(r = firstr; r != R; r = r->link) {
380 if(p->to.type == D_BRANCH) {
381 val = p->to.offset - initpc;
385 if(r2 != R && val >= r2->pc) {
395 diag(Z, "ref not found\n%P", p);
400 diag(Z, "ref to self\n%P", p);
410 print("\n%L %D\n", p->lineno, &p->from);
415 * find looping structure
417 for(r = firstr; r != R; r = r->link)
421 if(debug['R'] && debug['v']) {
422 print("\nlooping structure:\n");
423 for(r = firstr; r != R; r = r->link) {
424 print("%ld:%P", r->loop, r->prog);
425 for(z=0; z<BITS; z++)
426 bit.b[z] = r->use1.b[z] |
432 print(" u1=%B", r->use1);
434 print(" u2=%B", r->use2);
436 print(" st=%B", r->set);
444 * iterate propagating usage
445 * back until flow graph is complete
449 for(r = firstr; r != R; r = r->link)
451 for(r = firstr; r != R; r = r->link)
452 if(r->prog->as == ARET)
453 prop(r, zbits, zbits);
455 /* pick up unreachable code */
457 for(r = firstr; r != R; r = r1) {
459 if(r1 && r1->active && !r->active) {
460 prop(r, zbits, zbits);
472 * iterate propagating register/variable synchrony
473 * forward until graph is complete
477 for(r = firstr; r != R; r = r->link)
479 synch(firstr, zbits);
487 * calculate costs (paint1)
491 for(z=0; z<BITS; z++)
492 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
493 ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
495 nearln = r->prog->lineno;
496 warn(Z, "used and not set: %B", bit);
497 if(debug['R'] && !debug['w'])
498 print("used and not set: %B\n", bit);
501 if(debug['R'] && debug['v'])
502 print("\nprop structure:\n");
503 for(r = firstr; r != R; r = r->link)
507 for(r = firstr; r != R; r = r->link) {
508 if(debug['R'] && debug['v']) {
509 print("%P\t", r->prog);
511 print("s:%B ", r->set);
512 if(bany(&r->refahead))
513 print("ra:%B ", r->refahead);
514 if(bany(&r->calahead))
515 print("ca:%B ", r->calahead);
518 for(z=0; z<BITS; z++)
519 bit.b[z] = r->set.b[z] &
520 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
522 nearln = r->prog->lineno;
523 warn(Z, "set and not used: %B", bit);
525 print("set and not used: %B\n", bit);
528 for(z=0; z<BITS; z++)
529 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
535 if(debug['R'] && debug['v'])
538 bit.b[i/32] &= ~(1L<<(i%32));
542 r->prog->lineno, change, blsh(i));
547 if(nregion >= NRGN) {
548 warn(Z, "too many regions");
555 qsort(region, nregion, sizeof(region[0]), rcmp);
559 * determine used registers (paint2)
560 * replace code (paint3)
563 for(i=0; i<nregion; i++) {
564 bit = blsh(rgp->varno);
565 vreg = paint2(rgp->enter, rgp->varno);
566 vreg = allreg(vreg, rgp);
568 print("%L$%d %R: %B\n",
569 rgp->enter->prog->lineno,
575 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
580 * peep-hole on basic block
582 if(!debug['R'] || debug['P'])
590 for(r = firstr; r != R; r = r1) {
597 for(; p != p1; p = p->link) {
619 print("addrs: %B\n", addrs);
622 for(r = firstr; r != R; r = r->link) {
624 if(p->to.type == D_BRANCH)
625 p->to.offset = r->s2->pc;
632 * free aux structures
634 for(p = firstr->prog; p != P; p = p->link){
635 while(p->link && p->link->as == ANOP)
636 p->link = p->link->link;
649 addmove(Reg *r, int bn, int rn, int f)
655 p1 = alloc(sizeof(*p1));
661 p1->lineno = p->lineno;
667 a->offset = v->offset;
672 if(v->etype == TCHAR || v->etype == TUCHAR)
674 if(v->etype == TSHORT || v->etype == TUSHORT)
676 if(v->etype == TVLONG || v->etype == TUVLONG || v->etype == TIND)
678 if(v->etype == TFLOAT)
680 if(v->etype == TDOUBLE)
688 if(v->etype == TUCHAR)
690 if(v->etype == TUSHORT)
694 print("%P\t.a%P\n", p, p1);
705 if(r >= D_AX && r <= D_R15)
708 if(r >= D_AL && r <= D_R15B)
709 b |= RtoB(r-D_AL+D_AX);
711 if(r >= D_AH && r <= D_BH)
712 b |= RtoB(r-D_AH+D_AX);
714 if(r >= D_X0 && r <= D_X0+15)
720 mkvar(Reg *r, Adr *a)
729 * mark registers used
732 r->regu |= doregbits(t);
733 r->regu |= doregbits(a->index);
741 for(z=0; z<BITS; z++)
742 addrs.b[z] |= bit.b[z];
755 if(s->name[0] == '.')
760 for(i=0; i<nvar; i++) {
768 if(debug['w'] > 1 && s)
769 warn(Z, "variable not optimized: %s", s->name);
780 print("bit=%2d et=%2d %D\n", i, et, a);
784 if(n == D_EXTERN || n == D_STATIC)
785 for(z=0; z<BITS; z++)
786 externs.b[z] |= bit.b[z];
788 for(z=0; z<BITS; z++)
789 params.b[z] |= bit.b[z];
790 if(v->etype != et || !(typechlpfd[et] || typev[et])) /* funny punning */
791 for(z=0; z<BITS; z++)
792 addrs.b[z] |= bit.b[z];
800 prop(Reg *r, Bits ref, Bits cal)
805 for(r1 = r; r1 != R; r1 = r1->p1) {
806 for(z=0; z<BITS; z++) {
807 ref.b[z] |= r1->refahead.b[z];
808 if(ref.b[z] != r1->refahead.b[z]) {
809 r1->refahead.b[z] = ref.b[z];
812 cal.b[z] |= r1->calahead.b[z];
813 if(cal.b[z] != r1->calahead.b[z]) {
814 r1->calahead.b[z] = cal.b[z];
818 switch(r1->prog->as) {
820 for(z=0; z<BITS; z++) {
821 cal.b[z] |= ref.b[z] | externs.b[z];
827 for(z=0; z<BITS; z++) {
834 for(z=0; z<BITS; z++) {
835 cal.b[z] = externs.b[z];
839 for(z=0; z<BITS; z++) {
840 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
841 r1->use1.b[z] | r1->use2.b[z];
842 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
843 r1->refbehind.b[z] = ref.b[z];
844 r1->calbehind.b[z] = cal.b[z];
850 for(; r != r1; r = r->p1)
851 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
852 prop(r2, r->refbehind, r->calbehind);
856 * find looping structure
858 * 1) find reverse postordering
859 * 2) find approximate dominators,
860 * the actual dominators if the flow graph is reducible
861 * otherwise, dominators plus some other non-dominators.
862 * See Matthew S. Hecht and Jeffrey D. Ullman,
863 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
864 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
865 * Oct. 1-3, 1973, pp. 207-217.
866 * 3) find all nodes with a predecessor dominated by the current node.
867 * such a node is a loop head.
868 * recursively, all preds with a greater rpo number are in the loop
871 postorder(Reg *r, Reg **rpo2r, long n)
878 n = postorder(r1, rpo2r, n);
881 n = postorder(r1, rpo2r, n);
888 rpolca(long *idom, long rpo1, long rpo2)
903 fatal(Z, "bad idom");
911 doms(long *idom, long r, long s)
919 loophead(long *idom, Reg *r)
924 if(r->p1 != R && doms(idom, src, r->p1->rpo))
926 for(r = r->p2; r != R; r = r->p2link)
927 if(doms(idom, src, r->rpo))
933 loopmark(Reg **rpo2r, long head, Reg *r)
935 if(r->rpo < head || r->active == head)
940 loopmark(rpo2r, head, r->p1);
941 for(r = r->p2; r != R; r = r->p2link)
942 loopmark(rpo2r, head, r);
946 loopit(Reg *r, long nr)
952 rpo2r = alloc(nr * sizeof(Reg*));
953 idom = alloc(nr * sizeof(long));
957 d = postorder(r, rpo2r, 0);
959 fatal(Z, "too many reg nodes");
961 for(i = 0; i < nr / 2; i++){
963 rpo2r[i] = rpo2r[nr - 1 - i];
964 rpo2r[nr - 1 - i] = r1;
966 for(i = 0; i < nr; i++)
970 for(i = 0; i < nr; i++){
974 if(r1->p1 != R && r1->p1->rpo < me)
976 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
978 d = rpolca(idom, d, r1->rpo);
982 for(i = 0; i < nr; i++){
985 if(r1->p2 != R && loophead(idom, r1))
986 loopmark(rpo2r, i, r1);
991 synch(Reg *r, Bits dif)
996 for(r1 = r; r1 != R; r1 = r1->s1) {
997 for(z=0; z<BITS; z++) {
998 dif.b[z] = (dif.b[z] &
999 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
1000 r1->set.b[z] | r1->regdiff.b[z];
1001 if(dif.b[z] != r1->regdiff.b[z]) {
1002 r1->regdiff.b[z] = dif.b[z];
1009 for(z=0; z<BITS; z++)
1010 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
1017 allreg(ulong b, Rgn *r)
1027 diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
1043 if(i && r->cost > 0) {
1052 if(i && r->cost > 0) {
1062 paint1(Reg *r, int bn)
1071 if(r->act.b[z] & bb)
1074 if(!(r->refbehind.b[z] & bb))
1079 if(!(r1->refahead.b[z] & bb))
1081 if(r1->act.b[z] & bb)
1086 if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1087 change -= CLOAD * r->loop;
1088 if(debug['R'] && debug['v'])
1089 print("%ld%P\tld %B $%d\n", r->loop,
1090 r->prog, blsh(bn), change);
1096 if(r->use1.b[z] & bb) {
1097 change += CREF * r->loop;
1098 if(debug['R'] && debug['v'])
1099 print("%ld%P\tu1 %B $%d\n", r->loop,
1100 p, blsh(bn), change);
1103 if((r->use2.b[z]|r->set.b[z]) & bb) {
1104 change += CREF * r->loop;
1105 if(debug['R'] && debug['v'])
1106 print("%ld%P\tu2 %B $%d\n", r->loop,
1107 p, blsh(bn), change);
1110 if(STORE(r) & r->regdiff.b[z] & bb) {
1111 change -= CLOAD * r->loop;
1112 if(debug['R'] && debug['v'])
1113 print("%ld%P\tst %B $%d\n", r->loop,
1114 p, blsh(bn), change);
1117 if(r->refbehind.b[z] & bb)
1118 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1119 if(r1->refahead.b[z] & bb)
1122 if(!(r->refahead.b[z] & bb))
1126 if(r1->refbehind.b[z] & bb)
1131 if(r->act.b[z] & bb)
1133 if(!(r->refbehind.b[z] & bb))
1139 regset(Reg *r, ulong bb)
1147 while(b = bb & ~(bb-1)) {
1148 v.type = b & 0xFFFF? BtoR(b): BtoF(b);
1150 diag(Z, "zero v.type for %#lux", b);
1151 c = copyu(r->prog, &v, A);
1160 reguse(Reg *r, ulong bb)
1168 while(b = bb & ~(bb-1)) {
1169 v.type = b & 0xFFFF? BtoR(b): BtoF(b);
1170 c = copyu(r->prog, &v, A);
1171 if(c == 1 || c == 2 || c == 4)
1179 paint2(Reg *r, int bn)
1188 if(!(r->act.b[z] & bb))
1191 if(!(r->refbehind.b[z] & bb))
1196 if(!(r1->refahead.b[z] & bb))
1198 if(!(r1->act.b[z] & bb))
1207 if(r->refbehind.b[z] & bb)
1208 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1209 if(r1->refahead.b[z] & bb)
1210 vreg |= paint2(r1, bn);
1212 if(!(r->refahead.b[z] & bb))
1216 if(r1->refbehind.b[z] & bb)
1217 vreg |= paint2(r1, bn);
1221 if(!(r->act.b[z] & bb))
1223 if(!(r->refbehind.b[z] & bb))
1231 vreg |= reguse(r, x);
1239 paint3(Reg *r, int bn, long rb, int rn)
1248 if(r->act.b[z] & bb)
1251 if(!(r->refbehind.b[z] & bb))
1256 if(!(r1->refahead.b[z] & bb))
1258 if(r1->act.b[z] & bb)
1263 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1264 addmove(r, bn, rn, 0);
1269 if(r->use1.b[z] & bb) {
1272 addreg(&p->from, rn);
1274 print("\t.c%P\n", p);
1276 if((r->use2.b[z]|r->set.b[z]) & bb) {
1281 print("\t.c%P\n", p);
1284 if(STORE(r) & r->regdiff.b[z] & bb)
1285 addmove(r, bn, rn, 1);
1288 if(r->refbehind.b[z] & bb)
1289 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1290 if(r1->refahead.b[z] & bb)
1291 paint3(r1, bn, rb, rn);
1293 if(!(r->refahead.b[z] & bb))
1297 if(r1->refbehind.b[z] & bb)
1298 paint3(r1, bn, rb, rn);
1302 if(r->act.b[z] & bb)
1304 if(!(r->refbehind.b[z] & bb))
1310 addreg(Adr *a, int rn)
1322 if(r < D_AX || r > D_R15)
1324 return 1L << (r-D_AX);
1334 return bitno(b) + D_AX;
1346 if(f < FREGMIN || f > FREGEXT)
1348 return 1L << (f - FREGMIN + 16);
1358 return bitno(b) - 16 + FREGMIN;