10 r = alloc(sizeof(*r));
19 rcmp(const void *a1, const void *a2)
34 return p2->varno - p1->varno;
43 long val, initpc, npc;
56 for(z=0; z<BITS; z++) {
61 regbits = RtoB(0) | /* return reg */
62 AtoB(6) | AtoB(7) | /* sp and sb */
63 FtoB(0) | FtoB(1); /* floating return reg */
64 for(i=0; i<NREG; i++) {
75 * build aux data structure
77 * find use and set of variables
79 val = 5L * 5L * 5L * 5L * 5L;
89 for(; p != P; p = p->link) {
127 switch(r1->prog->as) {
135 bit = mkvar(&p->from, AGOK);
139 if(!(mvbits & B_INDIR))
140 for(z=0; z<BITS; z++)
141 addrs.b[z] |= bit.b[z];
145 for(z=0; z<BITS; z++)
146 addrs.b[z] |= bit.b[z];
147 for(z=0; z<BITS; z++)
148 r->use1.b[z] |= bit.b[z];
151 bit = mkvar(&p->to, p->as);
154 case ABSR: /* funny */
155 for(z=0; z<BITS; z++)
156 addrs.b[z] |= bit.b[z];
160 if(!(mvbits & B_INDIR))
161 for(z=0; z<BITS; z++)
162 addrs.b[z] |= bit.b[z];
165 case ACMPB: case ACMPW: case ACMPL:
166 case AFCMPF: case AFCMPD:
167 case ATSTB: case ATSTW: case ATSTL:
168 case AFTSTF: case AFTSTD:
169 case ABFEXTU: case ABFEXTS:
171 for(z=0; z<BITS; z++)
172 addrs.b[z] |= bit.b[z];
173 for(z=0; z<BITS; z++)
174 r->use2.b[z] |= bit.b[z];
178 diag(Z, "reg: unknown asop: %A", p->as);
180 case AADDB: case AADDW: case AADDL:
181 case ASUBB: case ASUBW: case ASUBL:
182 case AANDB: case AANDW: case AANDL:
183 case AORB: case AORW: case AORL:
184 case AEORB: case AEORW: case AEORL:
186 for(z=0; z<BITS; z++)
187 r->use2.b[z] |= bit.b[z];
190 case AMOVB: case AMOVW: case AMOVL:
191 case AFMOVEB: case AFMOVEW: case AFMOVEL:
192 case ACLRB: case ACLRW: case ACLRL:
193 case AFMOVEF: case AFMOVED:
195 for(z=0; z<BITS; z++)
196 r->use2.b[z] |= bit.b[z];
198 for(z=0; z<BITS; z++)
199 r->set.b[z] |= bit.b[z];
211 * turn branch references to pointers
212 * build back pointers
214 for(r = firstr; r != R; r = r->link) {
216 if(p->to.type == D_BRANCH) {
217 val = p->to.offset - initpc;
221 if(r2 != R && val >= r2->pc) {
230 diag(Z, "ref not found\n%L:%P", p->lineno, p);
234 diag(Z, "ref to self");
243 print("\n%L %D\n", firstr->prog->lineno, &firstr->prog->from);
247 * find looping structure
249 for(r = firstr; r != R; r = r->link)
253 if(debug['R'] && debug['v']) {
254 print("\nlooping structure:\n");
255 for(r = firstr; r != R; r = r->link) {
256 print("%ld:%P", r->loop, r->prog);
257 for(z=0; z<BITS; z++)
258 bit.b[z] = r->use1.b[z] |
259 r->use2.b[z] | r->set.b[z];
263 print(" u1=%B", r->use1);
265 print(" u2=%B", r->use2);
267 print(" st=%B", r->set);
275 * iterate propagating usage
276 * back until flow graph is complete
280 for(r = firstr; r != R; r = r->link)
282 for(r = firstr; r != R; r = r->link)
283 if(r->prog->as == ARTS)
284 prop(r, zbits, zbits);
286 /* pick up unreachable code */
288 for(r = firstr; r != R; r = r1) {
290 if(r1 && r1->active && !r->active) {
291 prop(r, zbits, zbits);
302 * iterate propagating register/variable synchrony
303 * forward until graph is complete
307 for(r = firstr; r != R; r = r->link)
309 synch(firstr, zbits);
317 * calculate costs (paint1)
321 for(z=0; z<BITS; z++)
322 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
323 ~(externs.b[z] | params.b[z] | addrs.b[z]);
325 nearln = r->prog->lineno;
326 warn(Z, "used and not set: %B", bit);
327 if(debug['R'] && !debug['w'])
328 print("used and not set: %B\n", bit);
332 * load of a denormalized fp will trap
336 bit.b[i/32] &= ~(1L << (i%32));
338 if(v->type == D_AUTO) {
339 r->set.b[i/32] |= (1L << (i%32));
341 addmove(r, i, NREG+NREG, 1);
346 if(debug['R'] && debug['v'])
347 print("\nprop structure:\n");
348 for(r = firstr; r != R; r = r->link) {
349 if(debug['R'] && debug['v'])
350 print("%P\n set = %B; rah = %B; cal = %B\n",
351 r->prog, r->set, r->refahead, r->calahead);
356 for(r = firstr; r != R; r = r->link) {
357 for(z=0; z<BITS; z++)
358 bit.b[z] = r->set.b[z] &
359 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
361 nearln = r->prog->lineno;
362 warn(Z, "set and not used: %B", bit);
364 print("set an not used: %B\n", bit);
367 for(z=0; z<BITS; z++)
368 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
375 if(debug['R'] && debug['v'])
378 bit.b[i/32] &= ~(1L<<(i%32));
379 if(changer <= 0 && changea <= 0) {
381 print("%L$%d.%d: %B\n",
383 changer, changea, blsh(i));
386 rgp->costr = changer;
387 rgp->costa = changea;
389 if(nregion >= NRGN) {
390 warn(Z, "too many regions");
397 qsort(region, nregion, sizeof(region[0]), rcmp);
401 * determine used registers (paint2)
402 * replace code (paint3)
405 for(i=0; i<nregion; i++) {
406 bit = blsh(rgp->varno);
407 vreg = paint2(rgp->enter, rgp->varno);
408 vreg = allreg(vreg, rgp);
410 print("%L$%d.%d %R: %B\n",
411 rgp->enter->prog->lineno,
412 rgp->costr, rgp->costa,
415 if(rgp->regno != D_NONE)
416 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
421 * peep-hole on basic block
423 if(!debug['R'] || debug['P'])
431 for(r = firstr; r != R; r = r1) {
438 for(; p != p1; p = p->link) {
460 print("addrs: %B\n", addrs);
463 for(r = firstr; r != R; r = r->link) {
465 if(p->to.type == D_BRANCH)
466 p->to.offset = r->s2->pc;
473 * free aux structures
475 for(p = firstr->prog; p != P; p = p->link){
476 while(p->link && p->link->as == ANOP)
477 p->link = p->link->link;
490 addmove(Reg *r, int bn, int rn, int f)
502 if(p1->from.type == D_CCR)
521 p1->lineno = p->lineno;
523 p1->from.type = D_CCR;
532 p1->lineno = p->lineno;
535 p1->from.sym = v->sym;
536 p1->from.type = v->type;
537 p1->from.offset = v->offset;
538 p1->from.etype = v->etype;
542 p1->from = zprog.from;
545 p1->as = opxt[OAS][v->etype];
551 p1->lineno = p->lineno;
553 p1->from.type = D_TOS;
558 print("%P\t.a%P\n", p, p1);
562 mkvar(Adr *a, int as)
571 t = a->type & D_MASK;
575 if(t >= D_R0 && t < D_R0+NREG) {
576 regbits |= RtoB(t-D_R0);
577 if(as == ADIVUL || as == ADIVSL)
578 regbits |= RtoB(t-D_R0+1);
580 if(t >= D_A0 && t < D_A0+NREG)
581 regbits |= AtoB(t-D_A0);
582 if(t >= D_F0 && t < D_F0+NREG)
583 regbits |= FtoB(t-D_F0);
596 if((a->type & I_MASK) == I_ADDR)
599 switch(a->index & I_MASK) {
612 for(i=0; i<nvar; i++) {
620 if(s->name[0] == '.')
623 if(debug['w'] > 1 && s)
624 warn(Z, "variable not optimized: %s", s->name);
635 print("bit=%2d et=%2d %s (%p,%d,%ld)\n",
636 i, a->etype, s->name,
637 v->sym, v->type, v->offset);
641 if(t == D_EXTERN || t == D_STATIC)
642 for(z=0; z<BITS; z++)
643 externs.b[z] |= bit.b[z];
645 for(z=0; z<BITS; z++)
646 params.b[z] |= bit.b[z];
647 if(a->etype != v->etype || !typechlpfd[a->etype])
648 for(z=0; z<BITS; z++)
649 addrs.b[z] |= bit.b[z]; /* funny punning */
657 prop(Reg *r, Bits ref, Bits cal)
662 for(r1 = r; r1 != R; r1 = r1->p1) {
663 for(z=0; z<BITS; z++) {
664 ref.b[z] |= r1->refahead.b[z];
665 if(ref.b[z] != r1->refahead.b[z]) {
666 r1->refahead.b[z] = ref.b[z];
669 cal.b[z] |= r1->calahead.b[z];
670 if(cal.b[z] != r1->calahead.b[z]) {
671 r1->calahead.b[z] = cal.b[z];
675 switch(r1->prog->as) {
677 for(z=0; z<BITS; z++) {
678 cal.b[z] |= ref.b[z] | externs.b[z];
684 for(z=0; z<BITS; z++) {
691 for(z=0; z<BITS; z++) {
692 cal.b[z] = externs.b[z];
696 for(z=0; z<BITS; z++) {
697 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
698 r1->use1.b[z] | r1->use2.b[z];
699 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
700 r1->refbehind.b[z] = ref.b[z];
701 r1->calbehind.b[z] = cal.b[z];
707 for(; r != r1; r = r->p1)
708 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
709 prop(r2, r->refbehind, r->calbehind);
713 * find looping structure
715 * 1) find reverse postordering
716 * 2) find approximate dominators,
717 * the actual dominators if the flow graph is reducible
718 * otherwise, dominators plus some other non-dominators.
719 * See Matthew S. Hecht and Jeffrey D. Ullman,
720 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
721 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
722 * Oct. 1-3, 1973, pp. 207-217.
723 * 3) find all nodes with a predecessor dominated by the current node.
724 * such a node is a loop head.
725 * recursively, all preds with a greater rpo number are in the loop
728 postorder(Reg *r, Reg **rpo2r, long n)
735 n = postorder(r1, rpo2r, n);
738 n = postorder(r1, rpo2r, n);
745 rpolca(long *idom, long rpo1, long rpo2)
760 fatal(Z, "bad idom");
768 doms(long *idom, long r, long s)
776 loophead(long *idom, Reg *r)
781 if(r->p1 != R && doms(idom, src, r->p1->rpo))
783 for(r = r->p2; r != R; r = r->p2link)
784 if(doms(idom, src, r->rpo))
790 loopmark(Reg **rpo2r, long head, Reg *r)
792 if(r->rpo < head || r->active == head)
797 loopmark(rpo2r, head, r->p1);
798 for(r = r->p2; r != R; r = r->p2link)
799 loopmark(rpo2r, head, r);
803 loopit(Reg *r, long nr)
809 rpo2r = alloc(nr * sizeof(Reg*));
810 idom = alloc(nr * sizeof(long));
814 d = postorder(r, rpo2r, 0);
816 fatal(Z, "too many reg nodes");
818 for(i = 0; i < nr / 2; i++){
820 rpo2r[i] = rpo2r[nr - 1 - i];
821 rpo2r[nr - 1 - i] = r1;
823 for(i = 0; i < nr; i++)
827 for(i = 0; i < nr; i++){
831 if(r1->p1 != R && r1->p1->rpo < me)
833 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
835 d = rpolca(idom, d, r1->rpo);
839 for(i = 0; i < nr; i++){
842 if(r1->p2 != R && loophead(idom, r1))
843 loopmark(rpo2r, i, r1);
848 synch(Reg *r, Bits dif)
853 for(r1 = r; r1 != R; r1 = r1->s1) {
854 for(z=0; z<BITS; z++) {
855 dif.b[z] = (dif.b[z] &
856 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
857 r1->set.b[z] | r1->regdiff.b[z];
858 if(dif.b[z] != r1->regdiff.b[z]) {
859 r1->regdiff.b[z] = dif.b[z];
866 for(z=0; z<BITS; z++)
867 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
874 allreg(ulong b, Rgn *r)
884 diag(Z, "unknown etype");
898 if(r->costa == r->costr)
901 if(j < NREG && r->costa > 0)
902 if(r->costa > r->costr || i >= NREG) {
906 if(i < NREG && r->costr > 0) {
925 paint1(Reg *r, int bn)
938 if(!(r->refbehind.b[z] & bb))
943 if(!(r1->refahead.b[z] & bb))
945 if(r1->act.b[z] & bb)
949 if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
950 changer -= CLOAD * r->loop;
951 changea -= CLOAD * r->loop;
952 if(debug['R'] && debug['v'])
953 print("%ld%P\tld %B $%d.%d\n", r->loop,
954 r->prog, blsh(bn), changer, changea);
960 if(r->use1.b[z] & bb) {
961 changer += CREF * r->loop;
962 changea += CREF * r->loop;
975 changer += (CXREF-CREF) * r->loop;
976 if(x != (I_INDEX3|D_NONE))
978 if((x&I_MASK) == I_INDEX1)
983 if(x >= D_R0 && x < D_R0+NREG)
985 if(x >= D_A0 && x < D_A0+NREG)
988 if(debug['R'] && debug['v'])
989 print("%ld%P\tu1 %B $%d.%d\n", r->loop,
990 p, blsh(bn), changer, changea);
992 if((r->use2.b[z]|r->set.b[z]) & bb) {
993 changer += CREF * r->loop;
994 changea += CREF * r->loop;
1005 case ACLRL: /* can be faked */
1006 case ATSTL: /* can be faked */
1010 changer += (CXREF-CREF) * r->loop;
1011 if(x != (I_INDEX3|D_NONE))
1013 if((x&I_MASK) == I_INDEX1)
1016 if(p->as == AMOVL) {
1018 if(x >= D_R0 && x < D_R0+NREG)
1020 if(x >= D_A0 && x < D_A0+NREG)
1023 if(debug['R'] && debug['v'])
1024 print("%ld%P\tu2 %B $%d.%d\n", r->loop,
1025 p, blsh(bn), changer, changea);
1027 if(STORE(r) & r->regdiff.b[z] & bb) {
1028 changer -= CLOAD * r->loop;
1029 changea -= CLOAD * r->loop;
1030 if(debug['R'] && debug['v'])
1031 print("%ld%P\tst %B $%d.%d\n", r->loop,
1032 p, blsh(bn), changer, changea);
1035 if(r->refbehind.b[z] & bb)
1036 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1037 if(r1->refahead.b[z] & bb)
1040 if(!(r->refahead.b[z] & bb))
1044 if(r1->refbehind.b[z] & bb)
1049 if(r->act.b[z] & bb)
1051 if(!(r->refbehind.b[z] & bb))
1057 paint2(Reg *r, int bn)
1066 if(!(r->act.b[z] & bb))
1069 if(!(r->refbehind.b[z] & bb))
1074 if(!(r1->refahead.b[z] & bb))
1076 if(!(r1->act.b[z] & bb))
1085 if(r->refbehind.b[z] & bb)
1086 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1087 if(r1->refahead.b[z] & bb)
1088 vreg |= paint2(r1, bn);
1090 if(!(r->refahead.b[z] & bb))
1094 if(r1->refbehind.b[z] & bb)
1095 vreg |= paint2(r1, bn);
1099 if(!(r->act.b[z] & bb))
1101 if(!(r->refbehind.b[z] & bb))
1108 paint3(Reg *r, int bn, ulong rb, int rn)
1117 if(r->act.b[z] & bb)
1120 if(!(r->refbehind.b[z] & bb))
1125 if(!(r1->refahead.b[z] & bb))
1127 if(r1->act.b[z] & bb)
1131 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1132 addmove(r, bn, rn, 0);
1137 if(r->use1.b[z] & bb) {
1140 addreg(&p->from, rn);
1142 print("\t.c%P\n", p);
1144 if((r->use2.b[z]|r->set.b[z]) & bb) {
1149 print("\t.c%P\n", p);
1151 if(STORE(r) & r->regdiff.b[z] & bb)
1152 addmove(r, bn, rn, 1);
1155 if(r->refbehind.b[z] & bb)
1156 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1157 if(r1->refahead.b[z] & bb)
1158 paint3(r1, bn, rb, rn);
1160 if(!(r->refahead.b[z] & bb))
1164 if(r1->refbehind.b[z] & bb)
1165 paint3(r1, bn, rb, rn);
1169 if(r->act.b[z] & bb)
1171 if(!(r->refbehind.b[z] & bb))
1177 addreg(Adr *a, int rn)
1183 if(rn >= D_R0 && rn < D_R0+NREG)
1185 if(x == (I_INDEX3|D_NONE)) {
1186 a->type = rn | I_INDIR;
1188 a->offset = a->displace;
1193 a->type = rn | I_INDIR;
1194 a->index += I_INDEX1 - I_INDEX2;
1195 a->offset = a->displace;
1199 a->type = rn | (a->type & I_INDIR);
1203 if(x == (I_INDEX3|D_NONE)) {
1204 a->type = D_NONE|I_INDIR;
1205 a->index += I_INDEX1 + rn - D_NONE - I_INDEX3;
1206 a->scale = 4; /* .L*1 */
1207 a->offset = a->displace;
1211 a->type = rn | (a->type & I_INDIR);
1224 if(r < 0 || r >= NREG)
1226 return 1L << (r + 0);
1236 return bitno(b) - 0;
1243 if(a < 0 || a >= NREG)
1245 return 1L << (a + NREG);
1255 return bitno(b) - NREG;
1262 if(f < 0 || f >= NREG)
1264 return 1L << (f + NREG+NREG);
1274 return bitno(b) - NREG-NREG;