12 r = alloc(sizeof(*r));
21 rcmp(void *a1, void *a2)
32 return p2->varno - p1->varno;
41 long initpc, val, npc;
55 for(z=0; z<BITS; z++) {
64 * build aux data structure
66 * find use and set of variables
68 val = 5L * 5L * 5L * 5L * 5L;
78 for(; p != P; p = p->link) {
115 switch(r1->prog->as) {
124 * left side always read
126 bit = mkvar(&p->from, p->as==AMOVQ || p->as==AMOVL);
127 for(z=0; z<BITS; z++)
128 r->use1.b[z] |= bit.b[z];
131 * right side depends on opcode
133 bit = mkvar(&p->to, 0);
137 diag(Z, "reg: unknown asop: %ud", p->as);
152 for(z=0; z<BITS; z++)
153 r->set.b[z] |= bit.b[z];
160 for(z=0; z<BITS; z++)
161 addrs.b[z] |= bit.b[z];
172 * turn branch references to pointers
173 * build back pointers
175 for(r = firstr; r != R; r = r->link) {
177 if(p->to.type == D_BRANCH) {
178 val = p->to.offset - initpc;
182 if(r2 != R && val >= r2->pc) {
192 diag(Z, "ref not found\n%P", p);
197 diag(Z, "ref to self\n%P", p);
207 print("\n%L %D\n", p->lineno, &p->from);
212 * find looping structure
214 for(r = firstr; r != R; r = r->link)
221 * iterate propagating usage
222 * back until flow graph is complete
226 for(r = firstr; r != R; r = r->link)
228 for(r = firstr; r != R; r = r->link)
229 if(r->prog->as == ARET)
230 prop(r, zbits, zbits);
232 /* pick up unreachable code */
234 for(r = firstr; r != R; r = r1) {
236 if(r1 && r1->active && !r->active) {
237 prop(r, zbits, zbits);
249 * iterate propagating register/variable synchrony
250 * forward until graph is complete
254 for(r = firstr; r != R; r = r->link)
256 synch(firstr, zbits);
262 if(debug['R'] && debug['v']) {
263 print("\nprop structure:\n");
264 for(r = firstr; r != R; r = r->link) {
265 print("%ld:%P", r->loop, r->prog);
266 for(z=0; z<BITS; z++)
267 bit.b[z] = r->set.b[z] |
268 r->refahead.b[z] | r->calahead.b[z] |
269 r->refbehind.b[z] | r->calbehind.b[z] |
270 r->use1.b[z] | r->use2.b[z];
274 print(" u1=%B", r->use1);
276 print(" u2=%B", r->use2);
278 print(" st=%B", r->set);
279 if(bany(&r->refahead))
280 print(" ra=%B", r->refahead);
281 if(bany(&r->calahead))
282 print(" ca=%B", r->calahead);
283 if(bany(&r->refbehind))
284 print(" rb=%B", r->refbehind);
285 if(bany(&r->calbehind))
286 print(" cb=%B", r->calbehind);
295 * calculate costs (paint1)
299 for(z=0; z<BITS; z++)
300 bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
301 ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
303 nearln = r->prog->lineno;
304 warn(Z, "used and not set: %B", bit);
305 if(debug['R'] && !debug['w'])
306 print("used and not set: %B\n", bit);
310 for(r = firstr; r != R; r = r->link)
314 for(r = firstr; r != R; r = r->link) {
315 for(z=0; z<BITS; z++)
316 bit.b[z] = r->set.b[z] &
317 ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
319 nearln = r->prog->lineno;
320 warn(Z, "set and not used: %B", bit);
322 print("set and not used: %B\n", bit);
325 for(z=0; z<BITS; z++)
326 bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
332 if(debug['R'] && debug['v'])
335 bit.b[i/32] &= ~(1L<<(i%32));
338 print("%L $%d: %B\n",
339 r->prog->lineno, change, blsh(i));
344 if(nregion >= NRGN) {
345 warn(Z, "too many regions");
352 qsort(region, nregion, sizeof(region[0]), rcmp);
356 * determine used registers (paint2)
357 * replace code (paint3)
360 for(i=0; i<nregion; i++) {
361 bit = blsh(rgp->varno);
362 vreg = paint2(rgp->enter, rgp->varno);
363 vreg = allreg(vreg, rgp);
365 if(rgp->regno >= NREG)
366 print("%L $%d F%d: %B\n",
367 rgp->enter->prog->lineno,
372 print("%L $%d R%d: %B\n",
373 rgp->enter->prog->lineno,
379 paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
384 * peep-hole on basic block
386 if(!debug['R'] || debug['P'])
394 for(r = firstr; r != R; r = r1) {
401 for(; p != p1; p = p->link) {
422 print("addrs: %B\n", addrs);
425 for(r = firstr; r != R; r = r->link) {
427 if(p->to.type == D_BRANCH)
428 p->to.offset = r->s2->pc;
435 * free aux structures
437 for(p = firstr->prog; p != P; p = p->link){
438 while(p->link && p->link->as == ANOP)
439 p->link = p->link->link;
454 for(r = firstr; r != R; r = r->link) {
457 if(r->prog->as == AJSR)
459 for(r1 = r->p2; r1 != R; r1 = r1->p2link) {
462 for(z=0; z<BITS; z++)
463 bit.b[z] = r1->calbehind.b[z] &
464 (r->refahead.b[z] | r->use1.b[z] | r->use2.b[z]) &
465 ~(r->calahead.b[z] & addrs.b[z]);
468 bit.b[i/32] &= ~(1L << (i%32));
479 addmove(Reg *r, int bn, int rn, int f)
485 p1 = alloc(sizeof(*p1));
491 p1->lineno = p->lineno;
498 a->offset = v->offset;
501 if(a->etype == TARRAY || a->sym == S)
505 if(v->etype == TCHAR || v->etype == TUCHAR) /* TODO: optimise locals? */
507 if(v->etype == TSHORT || v->etype == TUSHORT)
509 if(v->etype == TVLONG || v->etype == TUVLONG)
511 if(v->etype == TFLOAT)
513 if(v->etype == TDOUBLE)
516 p1->from.type = D_REG;
519 p1->from.type = D_FREG;
520 p1->from.reg = rn-NREG;
531 if(v->etype == TUCHAR)
533 if(v->etype == TUSHORT)
537 print("%P\t.a%P\n", p, p1);
541 mkvar(Adr *a, int docon)
550 if(t == D_REG && a->reg != NREG)
551 regbits |= RtoB(a->reg);
552 if(t == D_FREG && a->reg != NREG)
553 regbits |= FtoB(a->reg);
558 if(t != D_CONST || !docon || a->reg != NREG)
561 if (o > 0x7FFFFFFFLL || o < -0x80000000LL)
565 if(s == S && sval(o))
571 for(i=0; i<nvar; i++) {
579 if(s->name[0] == '.')
582 if(debug['w'] > 1 && s)
583 warn(Z, "variable not optimized: %s", s->name);
594 print("bit=%2d et=%2d %D\n", i, et, a);
597 if(n == D_EXTERN || n == D_STATIC)
598 for(z=0; z<BITS; z++)
599 externs.b[z] |= bit.b[z];
601 for(z=0; z<BITS; z++)
602 params.b[z] |= bit.b[z];
603 if(v->etype != et || (!typechlpfd[et] && !typev[et])) /* funny punning? */
604 for(z=0; z<BITS; z++)
605 addrs.b[z] |= bit.b[z];
608 for(z=0; z<BITS; z++)
609 consts.b[z] |= bit.b[z];
613 for(z=0; z<BITS; z++)
614 addrs.b[z] |= bit.b[z];
615 for(z=0; z<BITS; z++)
616 params.b[z] |= bit.b[z];
627 prop(Reg *r, Bits ref, Bits cal)
632 for(r1 = r; r1 != R; r1 = r1->p1) {
633 for(z=0; z<BITS; z++) {
634 ref.b[z] |= r1->refahead.b[z];
635 if(ref.b[z] != r1->refahead.b[z]) {
636 r1->refahead.b[z] = ref.b[z];
639 cal.b[z] |= r1->calahead.b[z];
640 if(cal.b[z] != r1->calahead.b[z]) {
641 r1->calahead.b[z] = cal.b[z];
645 switch(r1->prog->as) {
647 for(z=0; z<BITS; z++) {
648 cal.b[z] |= ref.b[z] | externs.b[z];
654 for(z=0; z<BITS; z++) {
661 for(z=0; z<BITS; z++) {
662 cal.b[z] = externs.b[z];
666 for(z=0; z<BITS; z++) {
667 ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
668 r1->use1.b[z] | r1->use2.b[z];
669 cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
670 r1->refbehind.b[z] = ref.b[z];
671 r1->calbehind.b[z] = cal.b[z];
677 for(; r != r1; r = r->p1)
678 for(r2 = r->p2; r2 != R; r2 = r2->p2link)
679 prop(r2, r->refbehind, r->calbehind);
683 * find looping structure
685 * 1) find reverse postordering
686 * 2) find approximate dominators,
687 * the actual dominators if the flow graph is reducible
688 * otherwise, dominators plus some other non-dominators.
689 * See Matthew S. Hecht and Jeffrey D. Ullman,
690 * "Analysis of a Simple Algorithm for Global Data Flow Problems",
691 * Conf. Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
692 * Oct. 1-3, 1973, pp. 207-217.
693 * 3) find all nodes with a predecessor dominated by the current node.
694 * such a node is a loop head.
695 * recursively, all preds with a greater rpo number are in the loop
698 postorder(Reg *r, Reg **rpo2r, long n)
705 n = postorder(r1, rpo2r, n);
708 n = postorder(r1, rpo2r, n);
715 rpolca(long *idom, long rpo1, long rpo2)
730 fatal(Z, "bad idom");
738 doms(long *idom, long r, long s)
746 loophead(long *idom, Reg *r)
751 if(r->p1 != R && doms(idom, src, r->p1->rpo))
753 for(r = r->p2; r != R; r = r->p2link)
754 if(doms(idom, src, r->rpo))
760 loopmark(Reg **rpo2r, long head, Reg *r)
762 if(r->rpo < head || r->active == head)
767 loopmark(rpo2r, head, r->p1);
768 for(r = r->p2; r != R; r = r->p2link)
769 loopmark(rpo2r, head, r);
773 loopit(Reg *r, long nr)
779 rpo2r = alloc(nr * sizeof(Reg*));
780 idom = alloc(nr * sizeof(long));
784 d = postorder(r, rpo2r, 0);
786 fatal(Z, "too many reg nodes");
788 for(i = 0; i < nr / 2; i++){
790 rpo2r[i] = rpo2r[nr - 1 - i];
791 rpo2r[nr - 1 - i] = r1;
793 for(i = 0; i < nr; i++)
797 for(i = 0; i < nr; i++){
801 if(r1->p1 != R && r1->p1->rpo < me)
803 for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
805 d = rpolca(idom, d, r1->rpo);
809 for(i = 0; i < nr; i++){
812 if(r1->p2 != R && loophead(idom, r1))
813 loopmark(rpo2r, i, r1);
818 synch(Reg *r, Bits dif)
823 for(r1 = r; r1 != R; r1 = r1->s1) {
824 for(z=0; z<BITS; z++) {
825 dif.b[z] = (dif.b[z] &
826 ~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
827 r1->set.b[z] | r1->regdiff.b[z];
828 if(dif.b[z] != r1->regdiff.b[z]) {
829 r1->regdiff.b[z] = dif.b[z];
836 for(z=0; z<BITS; z++)
837 dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
844 allreg(ulong b, Rgn *r)
854 diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
870 if(i && r->cost >= 0) {
879 if(i && r->cost >= 0) {
889 paint1(Reg *r, int bn)
901 if(!(r->refbehind.b[z] & bb))
906 if(!(r1->refahead.b[z] & bb))
908 if(r1->act.b[z] & bb)
913 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb) {
914 change -= CLOAD * r->loop;
915 if(debug['R'] && debug['v'])
916 print("%ld%P\tld %B $%d\n", r->loop,
917 r->prog, blsh(bn), change);
923 if(r->use1.b[z] & bb) {
924 change += CREF * r->loop;
925 if(debug['R'] && debug['v'])
926 print("%ld%P\tu1 %B $%d\n", r->loop,
927 p, blsh(bn), change);
930 if((r->use2.b[z]|r->set.b[z]) & bb) {
931 change += CREF * r->loop;
932 if(debug['R'] && debug['v'])
933 print("%ld%P\tu2 %B $%d\n", r->loop,
934 p, blsh(bn), change);
937 if(STORE(r) & r->regdiff.b[z] & bb) {
938 change -= CLOAD * r->loop;
939 if(debug['R'] && debug['v'])
940 print("%ld%P\tst %B $%d\n", r->loop,
941 p, blsh(bn), change);
944 if(r->refbehind.b[z] & bb)
945 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
946 if(r1->refahead.b[z] & bb)
949 if(!(r->refahead.b[z] & bb))
953 if(r1->refbehind.b[z] & bb)
960 if(!(r->refbehind.b[z] & bb))
966 paint2(Reg *r, int bn)
975 if(!(r->act.b[z] & bb))
978 if(!(r->refbehind.b[z] & bb))
983 if(!(r1->refahead.b[z] & bb))
985 if(!(r1->act.b[z] & bb))
994 if(r->refbehind.b[z] & bb)
995 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
996 if(r1->refahead.b[z] & bb)
997 vreg |= paint2(r1, bn);
999 if(!(r->refahead.b[z] & bb))
1003 if(r1->refbehind.b[z] & bb)
1004 vreg |= paint2(r1, bn);
1008 if(!(r->act.b[z] & bb))
1010 if(!(r->refbehind.b[z] & bb))
1017 paint3(Reg *r, int bn, long rb, int rn)
1026 if(r->act.b[z] & bb)
1029 if(!(r->refbehind.b[z] & bb))
1034 if(!(r1->refahead.b[z] & bb))
1036 if(r1->act.b[z] & bb)
1041 if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1042 addmove(r, bn, rn, 0);
1047 if(r->use1.b[z] & bb) {
1050 addreg(&p->from, rn);
1051 if(p->as == AMOVL && typechlp[var[bn].etype])
1054 print("\t.c%P\n", p);
1056 if((r->use2.b[z]|r->set.b[z]) & bb) {
1060 if(p->as == AMOVL && typechlp[var[bn].etype])
1063 print("\t.c%P\n", p);
1066 if(STORE(r) & r->regdiff.b[z] & bb)
1067 addmove(r, bn, rn, 1);
1070 if(r->refbehind.b[z] & bb)
1071 for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1072 if(r1->refahead.b[z] & bb)
1073 paint3(r1, bn, rb, rn);
1075 if(!(r->refahead.b[z] & bb))
1079 if(r1->refbehind.b[z] & bb)
1080 paint3(r1, bn, rb, rn);
1084 if(r->act.b[z] & bb)
1086 if(!(r->refbehind.b[z] & bb))
1092 addreg(Adr *a, int rn)
1120 if(r >= 2 && r <= 12)
1122 if(r >= 16 && r <= 25)
1153 return 1L << (f + 20);
1163 return bitno(b) - 20;