xref: /inferno-os/utils/8c/reg.c (revision 2b69dba5038ffd0b59cf30a4c44bce549e5097f8)
1 #include "gc.h"
2 
3 Reg*
4 rega(void)
5 {
6 	Reg *r;
7 
8 	r = freer;
9 	if(r == R) {
10 		r = alloc(sizeof(*r));
11 	} else
12 		freer = r->link;
13 
14 	*r = zreg;
15 	return r;
16 }
17 
18 int
19 rcmp(const void *a1, const void *a2)
20 {
21 	Rgn *p1, *p2;
22 	int c1, c2;
23 
24 	p1 = (Rgn*)a1;
25 	p2 = (Rgn*)a2;
26 	c1 = p2->cost;
27 	c2 = p1->cost;
28 	if(c1 -= c2)
29 		return c1;
30 	return p2->varno - p1->varno;
31 }
32 
33 void
34 regopt(Prog *p)
35 {
36 	Reg *r, *r1, *r2;
37 	Prog *p1;
38 	int i, z;
39 	long initpc, val, npc;
40 	ulong vreg;
41 	Bits bit;
42 	struct
43 	{
44 		long	m;
45 		long	c;
46 		Reg*	p;
47 	} log5[6], *lp;
48 
49 	firstr = R;
50 	lastr = R;
51 	nvar = 0;
52 	regbits = RtoB(D_SP) | RtoB(D_AX);
53 	for(z=0; z<BITS; z++) {
54 		externs.b[z] = 0;
55 		params.b[z] = 0;
56 		consts.b[z] = 0;
57 		addrs.b[z] = 0;
58 	}
59 
60 	/*
61 	 * pass 1
62 	 * build aux data structure
63 	 * allocate pcs
64 	 * find use and set of variables
65 	 */
66 	val = 5L * 5L * 5L * 5L * 5L;
67 	lp = log5;
68 	for(i=0; i<5; i++) {
69 		lp->m = val;
70 		lp->c = 0;
71 		lp->p = R;
72 		val /= 5L;
73 		lp++;
74 	}
75 	val = 0;
76 	for(; p != P; p = p->link) {
77 		switch(p->as) {
78 		case ADATA:
79 		case AGLOBL:
80 		case ANAME:
81 		case ASIGNAME:
82 			continue;
83 		}
84 		r = rega();
85 		if(firstr == R) {
86 			firstr = r;
87 			lastr = r;
88 		} else {
89 			lastr->link = r;
90 			r->p1 = lastr;
91 			lastr->s1 = r;
92 			lastr = r;
93 		}
94 		r->prog = p;
95 		r->pc = val;
96 		val++;
97 
98 		lp = log5;
99 		for(i=0; i<5; i++) {
100 			lp->c--;
101 			if(lp->c <= 0) {
102 				lp->c = lp->m;
103 				if(lp->p != R)
104 					lp->p->log5 = r;
105 				lp->p = r;
106 				(lp+1)->c = 0;
107 				break;
108 			}
109 			lp++;
110 		}
111 
112 		r1 = r->p1;
113 		if(r1 != R)
114 		switch(r1->prog->as) {
115 		case ARET:
116 		case AJMP:
117 		case AIRETL:
118 			r->p1 = R;
119 			r1->s1 = R;
120 		}
121 
122 		bit = mkvar(r, &p->from);
123 		if(bany(&bit))
124 		switch(p->as) {
125 		/*
126 		 * funny
127 		 */
128 		case ALEAL:
129 			for(z=0; z<BITS; z++)
130 				addrs.b[z] |= bit.b[z];
131 			break;
132 
133 		/*
134 		 * left side read
135 		 */
136 		default:
137 			for(z=0; z<BITS; z++)
138 				r->use1.b[z] |= bit.b[z];
139 			break;
140 		}
141 
142 		bit = mkvar(r, &p->to);
143 		if(bany(&bit))
144 		switch(p->as) {
145 		default:
146 			diag(Z, "reg: unknown op: %A", p->as);
147 			break;
148 
149 		/*
150 		 * right side read
151 		 */
152 		case ACMPB:
153 		case ACMPL:
154 		case ACMPW:
155 			for(z=0; z<BITS; z++)
156 				r->use2.b[z] |= bit.b[z];
157 			break;
158 
159 		/*
160 		 * right side write
161 		 */
162 		case ANOP:
163 		case AMOVL:
164 		case AMOVB:
165 		case AMOVW:
166 		case AMOVBLSX:
167 		case AMOVBLZX:
168 		case AMOVWLSX:
169 		case AMOVWLZX:
170 			for(z=0; z<BITS; z++)
171 				r->set.b[z] |= bit.b[z];
172 			break;
173 
174 		/*
175 		 * right side read+write
176 		 */
177 		case AADDB:
178 		case AADDL:
179 		case AADDW:
180 		case AANDB:
181 		case AANDL:
182 		case AANDW:
183 		case ASUBB:
184 		case ASUBL:
185 		case ASUBW:
186 		case AORB:
187 		case AORL:
188 		case AORW:
189 		case AXORB:
190 		case AXORL:
191 		case AXORW:
192 		case ASALB:
193 		case ASALL:
194 		case ASALW:
195 		case ASARB:
196 		case ASARL:
197 		case ASARW:
198 		case AROLB:
199 		case AROLL:
200 		case AROLW:
201 		case ARORB:
202 		case ARORL:
203 		case ARORW:
204 		case ASHLB:
205 		case ASHLL:
206 		case ASHLW:
207 		case ASHRB:
208 		case ASHRL:
209 		case ASHRW:
210 		case AIMULL:
211 		case AIMULW:
212 		case ANEGL:
213 		case ANOTL:
214 		case AADCL:
215 		case ASBBL:
216 			for(z=0; z<BITS; z++) {
217 				r->set.b[z] |= bit.b[z];
218 				r->use2.b[z] |= bit.b[z];
219 			}
220 			break;
221 
222 		/*
223 		 * funny
224 		 */
225 		case AFMOVDP:
226 		case AFMOVFP:
227 		case AFMOVLP:
228 		case AFMOVVP:
229 		case AFMOVWP:
230 		case ACALL:
231 			for(z=0; z<BITS; z++)
232 				addrs.b[z] |= bit.b[z];
233 			break;
234 		}
235 
236 		switch(p->as) {
237 		case AIMULL:
238 		case AIMULW:
239 			if(p->to.type != D_NONE)
240 				break;
241 
242 		case AIDIVB:
243 		case AIDIVL:
244 		case AIDIVW:
245 		case AIMULB:
246 		case ADIVB:
247 		case ADIVL:
248 		case ADIVW:
249 		case AMULB:
250 		case AMULL:
251 		case AMULW:
252 
253 		case ACWD:
254 		case ACDQ:
255 			r->regu |= RtoB(D_AX) | RtoB(D_DX);
256 			break;
257 
258 		case AREP:
259 		case AREPN:
260 		case ALOOP:
261 		case ALOOPEQ:
262 		case ALOOPNE:
263 			r->regu |= RtoB(D_CX);
264 			break;
265 
266 		case AMOVSB:
267 		case AMOVSL:
268 		case AMOVSW:
269 		case ACMPSB:
270 		case ACMPSL:
271 		case ACMPSW:
272 			r->regu |= RtoB(D_SI) | RtoB(D_DI);
273 			break;
274 
275 		case ASTOSB:
276 		case ASTOSL:
277 		case ASTOSW:
278 		case ASCASB:
279 		case ASCASL:
280 		case ASCASW:
281 			r->regu |= RtoB(D_AX) | RtoB(D_DI);
282 			break;
283 
284 		case AINSB:
285 		case AINSL:
286 		case AINSW:
287 		case AOUTSB:
288 		case AOUTSL:
289 		case AOUTSW:
290 			r->regu |= RtoB(D_DI) | RtoB(D_DX);
291 			break;
292 
293 		case AFSTSW:
294 		case ASAHF:
295 			r->regu |= RtoB(D_AX);
296 			break;
297 		}
298 	}
299 	if(firstr == R)
300 		return;
301 	initpc = pc - val;
302 	npc = val;
303 
304 	/*
305 	 * pass 2
306 	 * turn branch references to pointers
307 	 * build back pointers
308 	 */
309 	for(r = firstr; r != R; r = r->link) {
310 		p = r->prog;
311 		if(p->to.type == D_BRANCH) {
312 			val = p->to.offset - initpc;
313 			r1 = firstr;
314 			while(r1 != R) {
315 				r2 = r1->log5;
316 				if(r2 != R && val >= r2->pc) {
317 					r1 = r2;
318 					continue;
319 				}
320 				if(r1->pc == val)
321 					break;
322 				r1 = r1->link;
323 			}
324 			if(r1 == R) {
325 				nearln = p->lineno;
326 				diag(Z, "ref not found\n%P", p);
327 				continue;
328 			}
329 			if(r1 == r) {
330 				nearln = p->lineno;
331 				diag(Z, "ref to self\n%P", p);
332 				continue;
333 			}
334 			r->s2 = r1;
335 			r->p2link = r1->p2;
336 			r1->p2 = r;
337 		}
338 	}
339 	if(debug['R']) {
340 		p = firstr->prog;
341 		print("\n%L %D\n", p->lineno, &p->from);
342 	}
343 
344 	/*
345 	 * pass 2.5
346 	 * find looping structure
347 	 */
348 	for(r = firstr; r != R; r = r->link)
349 		r->active = 0;
350 	change = 0;
351 	loopit(firstr, npc);
352 	if(debug['R'] && debug['v']) {
353 		print("\nlooping structure:\n");
354 		for(r = firstr; r != R; r = r->link) {
355 			print("%ld:%P", r->loop, r->prog);
356 			for(z=0; z<BITS; z++)
357 				bit.b[z] = r->use1.b[z] |
358 					   r->use2.b[z] |
359 					   r->set.b[z];
360 			if(bany(&bit)) {
361 				print("\t");
362 				if(bany(&r->use1))
363 					print(" u1=%B", r->use1);
364 				if(bany(&r->use2))
365 					print(" u2=%B", r->use2);
366 				if(bany(&r->set))
367 					print(" st=%B", r->set);
368 			}
369 			print("\n");
370 		}
371 	}
372 
373 	/*
374 	 * pass 3
375 	 * iterate propagating usage
376 	 * 	back until flow graph is complete
377 	 */
378 loop1:
379 	change = 0;
380 	for(r = firstr; r != R; r = r->link)
381 		r->active = 0;
382 	for(r = firstr; r != R; r = r->link)
383 		if(r->prog->as == ARET)
384 			prop(r, zbits, zbits);
385 loop11:
386 	/* pick up unreachable code */
387 	i = 0;
388 	for(r = firstr; r != R; r = r1) {
389 		r1 = r->link;
390 		if(r1 && r1->active && !r->active) {
391 			prop(r, zbits, zbits);
392 			i = 1;
393 		}
394 	}
395 	if(i)
396 		goto loop11;
397 	if(change)
398 		goto loop1;
399 
400 
401 	/*
402 	 * pass 4
403 	 * iterate propagating register/variable synchrony
404 	 * 	forward until graph is complete
405 	 */
406 loop2:
407 	change = 0;
408 	for(r = firstr; r != R; r = r->link)
409 		r->active = 0;
410 	synch(firstr, zbits);
411 	if(change)
412 		goto loop2;
413 
414 
415 	/*
416 	 * pass 5
417 	 * isolate regions
418 	 * calculate costs (paint1)
419 	 */
420 	r = firstr;
421 	if(r) {
422 		for(z=0; z<BITS; z++)
423 			bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
424 			  ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
425 		if(bany(&bit)) {
426 			nearln = r->prog->lineno;
427 			warn(Z, "used and not set: %B", bit);
428 			if(debug['R'] && !debug['w'])
429 				print("used and not set: %B\n", bit);
430 		}
431 	}
432 	if(debug['R'] && debug['v'])
433 		print("\nprop structure:\n");
434 	for(r = firstr; r != R; r = r->link)
435 		r->act = zbits;
436 	rgp = region;
437 	nregion = 0;
438 	for(r = firstr; r != R; r = r->link) {
439 		if(debug['R'] && debug['v']) {
440 			print("%P\t", r->prog);
441 			if(bany(&r->set))
442 				print("s:%B ", r->set);
443 			if(bany(&r->refahead))
444 				print("ra:%B ", r->refahead);
445 			if(bany(&r->calahead))
446 				print("ca:%B ", r->calahead);
447 			print("\n");
448 		}
449 		for(z=0; z<BITS; z++)
450 			bit.b[z] = r->set.b[z] &
451 			  ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
452 		if(bany(&bit)) {
453 			nearln = r->prog->lineno;
454 			warn(Z, "set and not used: %B", bit);
455 			if(debug['R'])
456 				print("set and not used: %B\n", bit);
457 			excise(r);
458 		}
459 		for(z=0; z<BITS; z++)
460 			bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
461 		while(bany(&bit)) {
462 			i = bnum(bit);
463 			rgp->enter = r;
464 			rgp->varno = i;
465 			change = 0;
466 			if(debug['R'] && debug['v'])
467 				print("\n");
468 			paint1(r, i);
469 			bit.b[i/32] &= ~(1L<<(i%32));
470 			if(change <= 0) {
471 				if(debug['R'])
472 					print("%L$%d: %B\n",
473 						r->prog->lineno, change, blsh(i));
474 				continue;
475 			}
476 			rgp->cost = change;
477 			nregion++;
478 			if(nregion >= NRGN) {
479 				warn(Z, "too many regions");
480 				goto brk;
481 			}
482 			rgp++;
483 		}
484 	}
485 brk:
486 	qsort(region, nregion, sizeof(region[0]), rcmp);
487 
488 	/*
489 	 * pass 6
490 	 * determine used registers (paint2)
491 	 * replace code (paint3)
492 	 */
493 	rgp = region;
494 	for(i=0; i<nregion; i++) {
495 		bit = blsh(rgp->varno);
496 		vreg = paint2(rgp->enter, rgp->varno);
497 		vreg = allreg(vreg, rgp);
498 		if(debug['R']) {
499 			print("%L$%d %R: %B\n",
500 				rgp->enter->prog->lineno,
501 				rgp->cost,
502 				rgp->regno,
503 				bit);
504 		}
505 		if(rgp->regno != 0)
506 			paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
507 		rgp++;
508 	}
509 	/*
510 	 * pass 7
511 	 * peep-hole on basic block
512 	 */
513 	if(!debug['R'] || debug['P'])
514 		peep();
515 
516 	/*
517 	 * pass 8
518 	 * recalculate pc
519 	 */
520 	val = initpc;
521 	for(r = firstr; r != R; r = r1) {
522 		r->pc = val;
523 		p = r->prog;
524 		p1 = P;
525 		r1 = r->link;
526 		if(r1 != R)
527 			p1 = r1->prog;
528 		for(; p != p1; p = p->link) {
529 			switch(p->as) {
530 			default:
531 				val++;
532 				break;
533 
534 			case ANOP:
535 			case ADATA:
536 			case AGLOBL:
537 			case ANAME:
538 			case ASIGNAME:
539 				break;
540 			}
541 		}
542 	}
543 	pc = val;
544 
545 	/*
546 	 * fix up branches
547 	 */
548 	if(debug['R'])
549 		if(bany(&addrs))
550 			print("addrs: %B\n", addrs);
551 
552 	r1 = 0; /* set */
553 	for(r = firstr; r != R; r = r->link) {
554 		p = r->prog;
555 		if(p->to.type == D_BRANCH)
556 			p->to.offset = r->s2->pc;
557 		r1 = r;
558 	}
559 
560 	/*
561 	 * last pass
562 	 * eliminate nops
563 	 * free aux structures
564 	 */
565 	for(p = firstr->prog; p != P; p = p->link){
566 		while(p->link && p->link->as == ANOP)
567 			p->link = p->link->link;
568 	}
569 	if(r1 != R) {
570 		r1->link = freer;
571 		freer = firstr;
572 	}
573 }
574 
575 /*
576  * add mov b,rn
577  * just after r
578  */
579 void
580 addmove(Reg *r, int bn, int rn, int f)
581 {
582 	Prog *p, *p1;
583 	Adr *a;
584 	Var *v;
585 
586 	p1 = alloc(sizeof(*p1));
587 	*p1 = zprog;
588 	p = r->prog;
589 
590 	p1->link = p->link;
591 	p->link = p1;
592 	p1->lineno = p->lineno;
593 
594 	v = var + bn;
595 
596 	a = &p1->to;
597 	a->sym = v->sym;
598 	a->offset = v->offset;
599 	a->etype = v->etype;
600 	a->type = v->name;
601 
602 	p1->as = AMOVL;
603 	if(v->etype == TCHAR || v->etype == TUCHAR)
604 		p1->as = AMOVB;
605 	if(v->etype == TSHORT || v->etype == TUSHORT)
606 		p1->as = AMOVW;
607 
608 	p1->from.type = rn;
609 	if(!f) {
610 		p1->from = *a;
611 		*a = zprog.from;
612 		a->type = rn;
613 		if(v->etype == TUCHAR)
614 			p1->as = AMOVB;
615 		if(v->etype == TUSHORT)
616 			p1->as = AMOVW;
617 	}
618 	if(debug['R'])
619 		print("%P\t.a%P\n", p, p1);
620 }
621 
622 ulong
623 doregbits(int r)
624 {
625 	ulong b;
626 
627 	b = 0;
628 	if(r >= D_INDIR)
629 		r -= D_INDIR;
630 	if(r >= D_AX && r <= D_DI)
631 		b |= RtoB(r);
632 	else
633 	if(r >= D_AL && r <= D_BL)
634 		b |= RtoB(r-D_AL+D_AX);
635 	else
636 	if(r >= D_AH && r <= D_BH)
637 		b |= RtoB(r-D_AH+D_AX);
638 	return b;
639 }
640 
641 Bits
642 mkvar(Reg *r, Adr *a)
643 {
644 	Var *v;
645 	int i, t, n, et, z;
646 	long o;
647 	Bits bit;
648 	Sym *s;
649 
650 	/*
651 	 * mark registers used
652 	 */
653 	t = a->type;
654 	r->regu |= doregbits(t);
655 	r->regu |= doregbits(a->index);
656 
657 	switch(t) {
658 	default:
659 		goto none;
660 	case D_ADDR:
661 		a->type = a->index;
662 		bit = mkvar(r, a);
663 		for(z=0; z<BITS; z++)
664 			addrs.b[z] |= bit.b[z];
665 		a->type = t;
666 		goto none;
667 	case D_EXTERN:
668 	case D_STATIC:
669 	case D_PARAM:
670 	case D_AUTO:
671 		n = t;
672 		break;
673 	}
674 	s = a->sym;
675 	if(s == S)
676 		goto none;
677 	if(s->name[0] == '.')
678 		goto none;
679 	et = a->etype;
680 	o = a->offset;
681 	v = var;
682 	for(i=0; i<nvar; i++) {
683 		if(s == v->sym)
684 		if(n == v->name)
685 		if(o == v->offset)
686 			goto out;
687 		v++;
688 	}
689 	if(nvar >= NVAR) {
690 		if(debug['w'] > 1 && s)
691 			warn(Z, "variable not optimized: %s", s->name);
692 		goto none;
693 	}
694 	i = nvar;
695 	nvar++;
696 	v = &var[i];
697 	v->sym = s;
698 	v->offset = o;
699 	v->name = n;
700 	v->etype = et;
701 	if(debug['R'])
702 		print("bit=%2d et=%2d %D\n", i, et, a);
703 
704 out:
705 	bit = blsh(i);
706 	if(n == D_EXTERN || n == D_STATIC)
707 		for(z=0; z<BITS; z++)
708 			externs.b[z] |= bit.b[z];
709 	if(n == D_PARAM)
710 		for(z=0; z<BITS; z++)
711 			params.b[z] |= bit.b[z];
712 	if(v->etype != et || !typechlpfd[et])	/* funny punning */
713 		for(z=0; z<BITS; z++)
714 			addrs.b[z] |= bit.b[z];
715 	return bit;
716 
717 none:
718 	return zbits;
719 }
720 
721 void
722 prop(Reg *r, Bits ref, Bits cal)
723 {
724 	Reg *r1, *r2;
725 	int z;
726 
727 	for(r1 = r; r1 != R; r1 = r1->p1) {
728 		for(z=0; z<BITS; z++) {
729 			ref.b[z] |= r1->refahead.b[z];
730 			if(ref.b[z] != r1->refahead.b[z]) {
731 				r1->refahead.b[z] = ref.b[z];
732 				change++;
733 			}
734 			cal.b[z] |= r1->calahead.b[z];
735 			if(cal.b[z] != r1->calahead.b[z]) {
736 				r1->calahead.b[z] = cal.b[z];
737 				change++;
738 			}
739 		}
740 		switch(r1->prog->as) {
741 		case ACALL:
742 			for(z=0; z<BITS; z++) {
743 				cal.b[z] |= ref.b[z] | externs.b[z];
744 				ref.b[z] = 0;
745 			}
746 			break;
747 
748 		case ATEXT:
749 			for(z=0; z<BITS; z++) {
750 				cal.b[z] = 0;
751 				ref.b[z] = 0;
752 			}
753 			break;
754 
755 		case ARET:
756 			for(z=0; z<BITS; z++) {
757 				cal.b[z] = externs.b[z];
758 				ref.b[z] = 0;
759 			}
760 		}
761 		for(z=0; z<BITS; z++) {
762 			ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
763 				r1->use1.b[z] | r1->use2.b[z];
764 			cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
765 			r1->refbehind.b[z] = ref.b[z];
766 			r1->calbehind.b[z] = cal.b[z];
767 		}
768 		if(r1->active)
769 			break;
770 		r1->active = 1;
771 	}
772 	for(; r != r1; r = r->p1)
773 		for(r2 = r->p2; r2 != R; r2 = r2->p2link)
774 			prop(r2, r->refbehind, r->calbehind);
775 }
776 
777 /*
778  * find looping structure
779  *
780  * 1) find reverse postordering
781  * 2) find approximate dominators,
782  *	the actual dominators if the flow graph is reducible
783  *	otherwise, dominators plus some other non-dominators.
784  *	See Matthew S. Hecht and Jeffrey D. Ullman,
785  *	"Analysis of a Simple Algorithm for Global Data Flow Problems",
786  *	Conf.  Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
787  *	Oct. 1-3, 1973, pp.  207-217.
788  * 3) find all nodes with a predecessor dominated by the current node.
789  *	such a node is a loop head.
790  *	recursively, all preds with a greater rpo number are in the loop
791  */
792 long
793 postorder(Reg *r, Reg **rpo2r, long n)
794 {
795 	Reg *r1;
796 
797 	r->rpo = 1;
798 	r1 = r->s1;
799 	if(r1 && !r1->rpo)
800 		n = postorder(r1, rpo2r, n);
801 	r1 = r->s2;
802 	if(r1 && !r1->rpo)
803 		n = postorder(r1, rpo2r, n);
804 	rpo2r[n] = r;
805 	n++;
806 	return n;
807 }
808 
809 long
810 rpolca(long *idom, long rpo1, long rpo2)
811 {
812 	long t;
813 
814 	if(rpo1 == -1)
815 		return rpo2;
816 	while(rpo1 != rpo2){
817 		if(rpo1 > rpo2){
818 			t = rpo2;
819 			rpo2 = rpo1;
820 			rpo1 = t;
821 		}
822 		while(rpo1 < rpo2){
823 			t = idom[rpo2];
824 			if(t >= rpo2)
825 				fatal(Z, "bad idom");
826 			rpo2 = t;
827 		}
828 	}
829 	return rpo1;
830 }
831 
/*
 * Follow idom[] from s while the current index is greater than r.
 * Returns 1 if the walk lands exactly on r, 0 otherwise.
 */
int
doms(long *idom, long r, long s)
{
	long cur;

	for(cur = s; cur > r; cur = idom[cur])
		;
	return cur == r;
}
839 
840 int
841 loophead(long *idom, Reg *r)
842 {
843 	long src;
844 
845 	src = r->rpo;
846 	if(r->p1 != R && doms(idom, src, r->p1->rpo))
847 		return 1;
848 	for(r = r->p2; r != R; r = r->p2link)
849 		if(doms(idom, src, r->rpo))
850 			return 1;
851 	return 0;
852 }
853 
854 void
855 loopmark(Reg **rpo2r, long head, Reg *r)
856 {
857 	if(r->rpo < head || r->active == head)
858 		return;
859 	r->active = head;
860 	r->loop += LOOP;
861 	if(r->p1 != R)
862 		loopmark(rpo2r, head, r->p1);
863 	for(r = r->p2; r != R; r = r->p2link)
864 		loopmark(rpo2r, head, r);
865 }
866 
867 void
868 loopit(Reg *r, long nr)
869 {
870 	Reg *r1;
871 	long i, d, me;
872 
873 	if(nr > maxnr) {
874 		rpo2r = alloc(nr * sizeof(Reg*));
875 		idom = alloc(nr * sizeof(long));
876 		maxnr = nr;
877 	}
878 
879 	d = postorder(r, rpo2r, 0);
880 	if(d > nr)
881 		fatal(Z, "too many reg nodes");
882 	nr = d;
883 	for(i = 0; i < nr / 2; i++){
884 		r1 = rpo2r[i];
885 		rpo2r[i] = rpo2r[nr - 1 - i];
886 		rpo2r[nr - 1 - i] = r1;
887 	}
888 	for(i = 0; i < nr; i++)
889 		rpo2r[i]->rpo = i;
890 
891 	idom[0] = 0;
892 	for(i = 0; i < nr; i++){
893 		r1 = rpo2r[i];
894 		me = r1->rpo;
895 		d = -1;
896 		if(r1->p1 != R && r1->p1->rpo < me)
897 			d = r1->p1->rpo;
898 		for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
899 			if(r1->rpo < me)
900 				d = rpolca(idom, d, r1->rpo);
901 		idom[i] = d;
902 	}
903 
904 	for(i = 0; i < nr; i++){
905 		r1 = rpo2r[i];
906 		r1->loop++;
907 		if(r1->p2 != R && loophead(idom, r1))
908 			loopmark(rpo2r, i, r1);
909 	}
910 }
911 
912 void
913 synch(Reg *r, Bits dif)
914 {
915 	Reg *r1;
916 	int z;
917 
918 	for(r1 = r; r1 != R; r1 = r1->s1) {
919 		for(z=0; z<BITS; z++) {
920 			dif.b[z] = (dif.b[z] &
921 				~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
922 					r1->set.b[z] | r1->regdiff.b[z];
923 			if(dif.b[z] != r1->regdiff.b[z]) {
924 				r1->regdiff.b[z] = dif.b[z];
925 				change++;
926 			}
927 		}
928 		if(r1->active)
929 			break;
930 		r1->active = 1;
931 		for(z=0; z<BITS; z++)
932 			dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
933 		if(r1->s2 != R)
934 			synch(r1->s2, dif);
935 	}
936 }
937 
938 ulong
939 allreg(ulong b, Rgn *r)
940 {
941 	Var *v;
942 	int i;
943 
944 	v = var + r->varno;
945 	r->regno = 0;
946 	switch(v->etype) {
947 
948 	default:
949 		diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
950 		break;
951 
952 	case TCHAR:
953 	case TUCHAR:
954 	case TSHORT:
955 	case TUSHORT:
956 	case TINT:
957 	case TUINT:
958 	case TLONG:
959 	case TULONG:
960 	case TIND:
961 	case TARRAY:
962 		i = BtoR(~b);
963 		if(i && r->cost > 0) {
964 			r->regno = i;
965 			return RtoB(i);
966 		}
967 		break;
968 
969 	case TDOUBLE:
970 	case TFLOAT:
971 		break;
972 	}
973 	return 0;
974 }
975 
976 void
977 paint1(Reg *r, int bn)
978 {
979 	Reg *r1;
980 	Prog *p;
981 	int z;
982 	ulong bb;
983 
984 	z = bn/32;
985 	bb = 1L<<(bn%32);
986 	if(r->act.b[z] & bb)
987 		return;
988 	for(;;) {
989 		if(!(r->refbehind.b[z] & bb))
990 			break;
991 		r1 = r->p1;
992 		if(r1 == R)
993 			break;
994 		if(!(r1->refahead.b[z] & bb))
995 			break;
996 		if(r1->act.b[z] & bb)
997 			break;
998 		r = r1;
999 	}
1000 
1001 	if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1002 		change -= CLOAD * r->loop;
1003 		if(debug['R'] && debug['v'])
1004 			print("%ld%P\tld %B $%d\n", r->loop,
1005 				r->prog, blsh(bn), change);
1006 	}
1007 	for(;;) {
1008 		r->act.b[z] |= bb;
1009 		p = r->prog;
1010 
1011 		if(r->use1.b[z] & bb) {
1012 			change += CREF * r->loop;
1013 			if(p->as == AFMOVL)
1014 				if(BtoR(bb) != D_F0)
1015 					change = -CINF;
1016 			if(debug['R'] && debug['v'])
1017 				print("%ld%P\tu1 %B $%d\n", r->loop,
1018 					p, blsh(bn), change);
1019 		}
1020 
1021 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1022 			change += CREF * r->loop;
1023 			if(p->as == AFMOVL)
1024 				if(BtoR(bb) != D_F0)
1025 					change = -CINF;
1026 			if(debug['R'] && debug['v'])
1027 				print("%ld%P\tu2 %B $%d\n", r->loop,
1028 					p, blsh(bn), change);
1029 		}
1030 
1031 		if(STORE(r) & r->regdiff.b[z] & bb) {
1032 			change -= CLOAD * r->loop;
1033 			if(p->as == AFMOVL)
1034 				if(BtoR(bb) != D_F0)
1035 					change = -CINF;
1036 			if(debug['R'] && debug['v'])
1037 				print("%ld%P\tst %B $%d\n", r->loop,
1038 					p, blsh(bn), change);
1039 		}
1040 
1041 		if(r->refbehind.b[z] & bb)
1042 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1043 				if(r1->refahead.b[z] & bb)
1044 					paint1(r1, bn);
1045 
1046 		if(!(r->refahead.b[z] & bb))
1047 			break;
1048 		r1 = r->s2;
1049 		if(r1 != R)
1050 			if(r1->refbehind.b[z] & bb)
1051 				paint1(r1, bn);
1052 		r = r->s1;
1053 		if(r == R)
1054 			break;
1055 		if(r->act.b[z] & bb)
1056 			break;
1057 		if(!(r->refbehind.b[z] & bb))
1058 			break;
1059 	}
1060 }
1061 
1062 ulong
1063 regset(Reg *r, ulong bb)
1064 {
1065 	ulong b, set;
1066 	Adr v;
1067 	int c;
1068 
1069 	set = 0;
1070 	v = zprog.from;
1071 	while(b = bb & ~(bb-1)) {
1072 		v.type = BtoR(b);
1073 		c = copyu(r->prog, &v, A);
1074 		if(c == 3)
1075 			set |= b;
1076 		bb &= ~b;
1077 	}
1078 	return set;
1079 }
1080 
1081 ulong
1082 reguse(Reg *r, ulong bb)
1083 {
1084 	ulong b, set;
1085 	Adr v;
1086 	int c;
1087 
1088 	set = 0;
1089 	v = zprog.from;
1090 	while(b = bb & ~(bb-1)) {
1091 		v.type = BtoR(b);
1092 		c = copyu(r->prog, &v, A);
1093 		if(c == 1 || c == 2 || c == 4)
1094 			set |= b;
1095 		bb &= ~b;
1096 	}
1097 	return set;
1098 }
1099 
1100 ulong
1101 paint2(Reg *r, int bn)
1102 {
1103 	Reg *r1;
1104 	int z;
1105 	ulong bb, vreg, x;
1106 
1107 	z = bn/32;
1108 	bb = 1L << (bn%32);
1109 	vreg = regbits;
1110 	if(!(r->act.b[z] & bb))
1111 		return vreg;
1112 	for(;;) {
1113 		if(!(r->refbehind.b[z] & bb))
1114 			break;
1115 		r1 = r->p1;
1116 		if(r1 == R)
1117 			break;
1118 		if(!(r1->refahead.b[z] & bb))
1119 			break;
1120 		if(!(r1->act.b[z] & bb))
1121 			break;
1122 		r = r1;
1123 	}
1124 	for(;;) {
1125 		r->act.b[z] &= ~bb;
1126 
1127 		vreg |= r->regu;
1128 
1129 		if(r->refbehind.b[z] & bb)
1130 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1131 				if(r1->refahead.b[z] & bb)
1132 					vreg |= paint2(r1, bn);
1133 
1134 		if(!(r->refahead.b[z] & bb))
1135 			break;
1136 		r1 = r->s2;
1137 		if(r1 != R)
1138 			if(r1->refbehind.b[z] & bb)
1139 				vreg |= paint2(r1, bn);
1140 		r = r->s1;
1141 		if(r == R)
1142 			break;
1143 		if(!(r->act.b[z] & bb))
1144 			break;
1145 		if(!(r->refbehind.b[z] & bb))
1146 			break;
1147 	}
1148 
1149 	bb = vreg;
1150 	for(; r; r=r->s1) {
1151 		x = r->regu & ~bb;
1152 		if(x) {
1153 			vreg |= reguse(r, x);
1154 			bb |= regset(r, x);
1155 		}
1156 	}
1157 	return vreg;
1158 }
1159 
1160 void
1161 paint3(Reg *r, int bn, long rb, int rn)
1162 {
1163 	Reg *r1;
1164 	Prog *p;
1165 	int z;
1166 	ulong bb;
1167 
1168 	z = bn/32;
1169 	bb = 1L << (bn%32);
1170 	if(r->act.b[z] & bb)
1171 		return;
1172 	for(;;) {
1173 		if(!(r->refbehind.b[z] & bb))
1174 			break;
1175 		r1 = r->p1;
1176 		if(r1 == R)
1177 			break;
1178 		if(!(r1->refahead.b[z] & bb))
1179 			break;
1180 		if(r1->act.b[z] & bb)
1181 			break;
1182 		r = r1;
1183 	}
1184 
1185 	if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1186 		addmove(r, bn, rn, 0);
1187 	for(;;) {
1188 		r->act.b[z] |= bb;
1189 		p = r->prog;
1190 
1191 		if(r->use1.b[z] & bb) {
1192 			if(debug['R'])
1193 				print("%P", p);
1194 			addreg(&p->from, rn);
1195 			if(debug['R'])
1196 				print("\t.c%P\n", p);
1197 		}
1198 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1199 			if(debug['R'])
1200 				print("%P", p);
1201 			addreg(&p->to, rn);
1202 			if(debug['R'])
1203 				print("\t.c%P\n", p);
1204 		}
1205 
1206 		if(STORE(r) & r->regdiff.b[z] & bb)
1207 			addmove(r, bn, rn, 1);
1208 		r->regu |= rb;
1209 
1210 		if(r->refbehind.b[z] & bb)
1211 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1212 				if(r1->refahead.b[z] & bb)
1213 					paint3(r1, bn, rb, rn);
1214 
1215 		if(!(r->refahead.b[z] & bb))
1216 			break;
1217 		r1 = r->s2;
1218 		if(r1 != R)
1219 			if(r1->refbehind.b[z] & bb)
1220 				paint3(r1, bn, rb, rn);
1221 		r = r->s1;
1222 		if(r == R)
1223 			break;
1224 		if(r->act.b[z] & bb)
1225 			break;
1226 		if(!(r->refbehind.b[z] & bb))
1227 			break;
1228 	}
1229 }
1230 
1231 void
1232 addreg(Adr *a, int rn)
1233 {
1234 
1235 	a->sym = 0;
1236 	a->offset = 0;
1237 	a->type = rn;
1238 }
1239 
1240 long
1241 RtoB(int r)
1242 {
1243 
1244 	if(r < D_AX || r > D_DI)
1245 		return 0;
1246 	return 1L << (r-D_AX);
1247 }
1248 
1249 int
1250 BtoR(long b)
1251 {
1252 
1253 	b &= 0xffL;
1254 	if(b == 0)
1255 		return 0;
1256 	return bitno(b) + D_AX;
1257 }
1258