xref: /inferno-os/utils/8c/reg.c (revision 9dbf735d35c339c90deaed43fc0ae17f16c122f7)
1 #include "gc.h"
2 
3 Reg*
4 rega(void)
5 {
6 	Reg *r;
7 
8 	r = freer;
9 	if(r == R) {
10 		r = alloc(sizeof(*r));
11 	} else
12 		freer = r->link;
13 
14 	*r = zreg;
15 	return r;
16 }
17 
18 int
19 rcmp(void *a1, void *a2)
20 {
21 	Rgn *p1, *p2;
22 	int c1, c2;
23 
24 	p1 = (Rgn*)a1;
25 	p2 = (Rgn*)a2;
26 	c1 = p2->cost;
27 	c2 = p1->cost;
28 	if(c1 -= c2)
29 		return c1;
30 	return p2->varno - p1->varno;
31 }
32 
33 void
34 regopt(Prog *p)
35 {
36 	Reg *r, *r1, *r2;
37 	Prog *p1;
38 	int i, z;
39 	long initpc, val, npc;
40 	ulong vreg;
41 	Bits bit;
42 	struct
43 	{
44 		long	m;
45 		long	c;
46 		Reg*	p;
47 	} log5[6], *lp;
48 
49 	firstr = R;
50 	lastr = R;
51 	nvar = 0;
52 	regbits = RtoB(D_SP) | RtoB(D_AX);
53 	for(z=0; z<BITS; z++) {
54 		externs.b[z] = 0;
55 		params.b[z] = 0;
56 		consts.b[z] = 0;
57 		addrs.b[z] = 0;
58 	}
59 
60 	/*
61 	 * pass 1
62 	 * build aux data structure
63 	 * allocate pcs
64 	 * find use and set of variables
65 	 */
66 	val = 5L * 5L * 5L * 5L * 5L;
67 	lp = log5;
68 	for(i=0; i<5; i++) {
69 		lp->m = val;
70 		lp->c = 0;
71 		lp->p = R;
72 		val /= 5L;
73 		lp++;
74 	}
75 	val = 0;
76 	for(; p != P; p = p->link) {
77 		switch(p->as) {
78 		case ADATA:
79 		case AGLOBL:
80 		case ANAME:
81 		case ASIGNAME:
82 			continue;
83 		}
84 		r = rega();
85 		if(firstr == R) {
86 			firstr = r;
87 			lastr = r;
88 		} else {
89 			lastr->link = r;
90 			r->p1 = lastr;
91 			lastr->s1 = r;
92 			lastr = r;
93 		}
94 		r->prog = p;
95 		r->pc = val;
96 		val++;
97 
98 		lp = log5;
99 		for(i=0; i<5; i++) {
100 			lp->c--;
101 			if(lp->c <= 0) {
102 				lp->c = lp->m;
103 				if(lp->p != R)
104 					lp->p->log5 = r;
105 				lp->p = r;
106 				(lp+1)->c = 0;
107 				break;
108 			}
109 			lp++;
110 		}
111 
112 		r1 = r->p1;
113 		if(r1 != R)
114 		switch(r1->prog->as) {
115 		case ARET:
116 		case AJMP:
117 		case AIRETL:
118 			r->p1 = R;
119 			r1->s1 = R;
120 		}
121 
122 		bit = mkvar(r, &p->from, p->as==AMOVL);
123 		if(bany(&bit))
124 		switch(p->as) {
125 		/*
126 		 * funny
127 		 */
128 		case ALEAL:
129 			for(z=0; z<BITS; z++)
130 				addrs.b[z] |= bit.b[z];
131 			break;
132 
133 		/*
134 		 * left side read
135 		 */
136 		default:
137 			for(z=0; z<BITS; z++)
138 				r->use1.b[z] |= bit.b[z];
139 			break;
140 		}
141 
142 		bit = mkvar(r, &p->to, 0);
143 		if(bany(&bit))
144 		switch(p->as) {
145 		default:
146 			diag(Z, "reg: unknown op: %A", p->as);
147 			break;
148 
149 		/*
150 		 * right side read
151 		 */
152 		case ACMPB:
153 		case ACMPL:
154 		case ACMPW:
155 			for(z=0; z<BITS; z++)
156 				r->use2.b[z] |= bit.b[z];
157 			break;
158 
159 		/*
160 		 * right side write
161 		 */
162 		case ANOP:
163 		case AMOVL:
164 		case AMOVB:
165 		case AMOVW:
166 		case AMOVBLSX:
167 		case AMOVBLZX:
168 		case AMOVWLSX:
169 		case AMOVWLZX:
170 			for(z=0; z<BITS; z++)
171 				r->set.b[z] |= bit.b[z];
172 			break;
173 
174 		/*
175 		 * right side read+write
176 		 */
177 		case AADDB:
178 		case AADDL:
179 		case AADDW:
180 		case AANDB:
181 		case AANDL:
182 		case AANDW:
183 		case ASUBB:
184 		case ASUBL:
185 		case ASUBW:
186 		case AORB:
187 		case AORL:
188 		case AORW:
189 		case AXORB:
190 		case AXORL:
191 		case AXORW:
192 		case ASALB:
193 		case ASALL:
194 		case ASALW:
195 		case ASARB:
196 		case ASARL:
197 		case ASARW:
198 		case AROLB:
199 		case AROLL:
200 		case AROLW:
201 		case ARORB:
202 		case ARORL:
203 		case ARORW:
204 		case ASHLB:
205 		case ASHLL:
206 		case ASHLW:
207 		case ASHRB:
208 		case ASHRL:
209 		case ASHRW:
210 		case AIMULL:
211 		case AIMULW:
212 		case ANEGL:
213 		case ANOTL:
214 		case AADCL:
215 		case ASBBL:
216 			for(z=0; z<BITS; z++) {
217 				r->set.b[z] |= bit.b[z];
218 				r->use2.b[z] |= bit.b[z];
219 			}
220 			break;
221 
222 		/*
223 		 * funny
224 		 */
225 		case AFMOVDP:
226 		case AFMOVFP:
227 		case AFMOVLP:
228 		case AFMOVVP:
229 		case AFMOVWP:
230 		case ACALL:
231 			for(z=0; z<BITS; z++)
232 				addrs.b[z] |= bit.b[z];
233 			break;
234 		}
235 
236 		switch(p->as) {
237 		case AIMULL:
238 		case AIMULW:
239 			if(p->to.type != D_NONE)
240 				break;
241 
242 		case AIDIVB:
243 		case AIDIVL:
244 		case AIDIVW:
245 		case AIMULB:
246 		case ADIVB:
247 		case ADIVL:
248 		case ADIVW:
249 		case AMULB:
250 		case AMULL:
251 		case AMULW:
252 
253 		case ACWD:
254 		case ACDQ:
255 			r->regu |= RtoB(D_AX) | RtoB(D_DX);
256 			break;
257 
258 		case AREP:
259 		case AREPN:
260 		case ALOOP:
261 		case ALOOPEQ:
262 		case ALOOPNE:
263 			r->regu |= RtoB(D_CX);
264 			break;
265 
266 		case AMOVSB:
267 		case AMOVSL:
268 		case AMOVSW:
269 		case ACMPSB:
270 		case ACMPSL:
271 		case ACMPSW:
272 			r->regu |= RtoB(D_SI) | RtoB(D_DI);
273 			break;
274 
275 		case ASTOSB:
276 		case ASTOSL:
277 		case ASTOSW:
278 		case ASCASB:
279 		case ASCASL:
280 		case ASCASW:
281 			r->regu |= RtoB(D_AX) | RtoB(D_DI);
282 			break;
283 
284 		case AINSB:
285 		case AINSL:
286 		case AINSW:
287 		case AOUTSB:
288 		case AOUTSL:
289 		case AOUTSW:
290 			r->regu |= RtoB(D_DI) | RtoB(D_DX);
291 			break;
292 
293 		case AFSTSW:
294 		case ASAHF:
295 			r->regu |= RtoB(D_AX);
296 			break;
297 		}
298 	}
299 	if(firstr == R)
300 		return;
301 	initpc = pc - val;
302 	npc = val;
303 
304 	/*
305 	 * pass 2
306 	 * turn branch references to pointers
307 	 * build back pointers
308 	 */
309 	for(r = firstr; r != R; r = r->link) {
310 		p = r->prog;
311 		if(p->to.type == D_BRANCH) {
312 			val = p->to.offset - initpc;
313 			r1 = firstr;
314 			while(r1 != R) {
315 				r2 = r1->log5;
316 				if(r2 != R && val >= r2->pc) {
317 					r1 = r2;
318 					continue;
319 				}
320 				if(r1->pc == val)
321 					break;
322 				r1 = r1->link;
323 			}
324 			if(r1 == R) {
325 				nearln = p->lineno;
326 				diag(Z, "ref not found\n%P", p);
327 				continue;
328 			}
329 			if(r1 == r) {
330 				nearln = p->lineno;
331 				diag(Z, "ref to self\n%P", p);
332 				continue;
333 			}
334 			r->s2 = r1;
335 			r->p2link = r1->p2;
336 			r1->p2 = r;
337 		}
338 	}
339 	if(debug['R']) {
340 		p = firstr->prog;
341 		print("\n%L %D\n", p->lineno, &p->from);
342 	}
343 
344 	/*
345 	 * pass 2.5
346 	 * find looping structure
347 	 */
348 	for(r = firstr; r != R; r = r->link)
349 		r->active = 0;
350 	change = 0;
351 	loopit(firstr, npc);
352 	if(debug['R'] && debug['v']) {
353 		print("\nlooping structure:\n");
354 		for(r = firstr; r != R; r = r->link) {
355 			print("%ld:%P", r->loop, r->prog);
356 			for(z=0; z<BITS; z++)
357 				bit.b[z] = r->use1.b[z] |
358 					   r->use2.b[z] |
359 					   r->set.b[z];
360 			if(bany(&bit)) {
361 				print("\t");
362 				if(bany(&r->use1))
363 					print(" u1=%B", r->use1);
364 				if(bany(&r->use2))
365 					print(" u2=%B", r->use2);
366 				if(bany(&r->set))
367 					print(" st=%B", r->set);
368 			}
369 			print("\n");
370 		}
371 	}
372 
373 	/*
374 	 * pass 3
375 	 * iterate propagating usage
376 	 * 	back until flow graph is complete
377 	 */
378 loop1:
379 	change = 0;
380 	for(r = firstr; r != R; r = r->link)
381 		r->active = 0;
382 	for(r = firstr; r != R; r = r->link)
383 		if(r->prog->as == ARET)
384 			prop(r, zbits, zbits);
385 loop11:
386 	/* pick up unreachable code */
387 	i = 0;
388 	for(r = firstr; r != R; r = r1) {
389 		r1 = r->link;
390 		if(r1 && r1->active && !r->active) {
391 			prop(r, zbits, zbits);
392 			i = 1;
393 		}
394 	}
395 	if(i)
396 		goto loop11;
397 	if(change)
398 		goto loop1;
399 
400 
401 	/*
402 	 * pass 4
403 	 * iterate propagating register/variable synchrony
404 	 * 	forward until graph is complete
405 	 */
406 loop2:
407 	change = 0;
408 	for(r = firstr; r != R; r = r->link)
409 		r->active = 0;
410 	synch(firstr, zbits);
411 	if(change)
412 		goto loop2;
413 
414 
415 	/*
416 	 * pass 5
417 	 * isolate regions
418 	 * calculate costs (paint1)
419 	 */
420 	r = firstr;
421 	if(r) {
422 		for(z=0; z<BITS; z++)
423 			bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
424 			  ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
425 		if(bany(&bit)) {
426 			nearln = r->prog->lineno;
427 			warn(Z, "used and not set: %B", bit);
428 			if(debug['R'] && !debug['w'])
429 				print("used and not set: %B\n", bit);
430 		}
431 	}
432 	if(debug['R'] && debug['v'])
433 		print("\nprop structure:\n");
434 	for(r = firstr; r != R; r = r->link)
435 		r->act = zbits;
436 	rgp = region;
437 	nregion = 0;
438 	for(r = firstr; r != R; r = r->link) {
439 		if(debug['R'] && debug['v']) {
440 			print("%P\t", r->prog);
441 			if(bany(&r->set))
442 				print("s:%B ", r->set);
443 			if(bany(&r->refahead))
444 				print("ra:%B ", r->refahead);
445 			if(bany(&r->calahead))
446 				print("ca:%B ", r->calahead);
447 			print("\n");
448 		}
449 		for(z=0; z<BITS; z++)
450 			bit.b[z] = r->set.b[z] &
451 			  ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
452 		if(bany(&bit)) {
453 			nearln = r->prog->lineno;
454 			warn(Z, "set and not used: %B", bit);
455 			if(debug['R'])
456 				print("set and not used: %B\n", bit);
457 			excise(r);
458 		}
459 		for(z=0; z<BITS; z++)
460 			bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
461 		while(bany(&bit)) {
462 			i = bnum(bit);
463 			rgp->enter = r;
464 			rgp->varno = i;
465 			change = 0;
466 			if(debug['R'] && debug['v'])
467 				print("\n");
468 			paint1(r, i);
469 			bit.b[i/32] &= ~(1L<<(i%32));
470 			if(change <= 0) {
471 				if(debug['R'])
472 					print("%L$%d: %B\n",
473 						r->prog->lineno, change, blsh(i));
474 				continue;
475 			}
476 			rgp->cost = change;
477 			nregion++;
478 			if(nregion >= NRGN) {
479 				warn(Z, "too many regions");
480 				goto brk;
481 			}
482 			rgp++;
483 		}
484 	}
485 brk:
486 	qsort(region, nregion, sizeof(region[0]), rcmp);
487 
488 	/*
489 	 * pass 6
490 	 * determine used registers (paint2)
491 	 * replace code (paint3)
492 	 */
493 	rgp = region;
494 	for(i=0; i<nregion; i++) {
495 		bit = blsh(rgp->varno);
496 		vreg = paint2(rgp->enter, rgp->varno);
497 		vreg = allreg(vreg, rgp);
498 		if(debug['R']) {
499 			print("%L$%d %R: %B\n",
500 				rgp->enter->prog->lineno,
501 				rgp->cost,
502 				rgp->regno,
503 				bit);
504 		}
505 		if(rgp->regno != 0)
506 			paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
507 		rgp++;
508 	}
509 	/*
510 	 * pass 7
511 	 * peep-hole on basic block
512 	 */
513 	if(!debug['R'] || debug['P'])
514 		peep();
515 
516 	/*
517 	 * pass 8
518 	 * recalculate pc
519 	 */
520 	val = initpc;
521 	for(r = firstr; r != R; r = r1) {
522 		r->pc = val;
523 		p = r->prog;
524 		p1 = P;
525 		r1 = r->link;
526 		if(r1 != R)
527 			p1 = r1->prog;
528 		for(; p != p1; p = p->link) {
529 			switch(p->as) {
530 			default:
531 				val++;
532 				break;
533 
534 			case ANOP:
535 			case ADATA:
536 			case AGLOBL:
537 			case ANAME:
538 			case ASIGNAME:
539 				break;
540 			}
541 		}
542 	}
543 	pc = val;
544 
545 	/*
546 	 * fix up branches
547 	 */
548 	if(debug['R'])
549 		if(bany(&addrs))
550 			print("addrs: %B\n", addrs);
551 
552 	r1 = 0; /* set */
553 	for(r = firstr; r != R; r = r->link) {
554 		p = r->prog;
555 		if(p->to.type == D_BRANCH)
556 			p->to.offset = r->s2->pc;
557 		r1 = r;
558 	}
559 
560 	/*
561 	 * last pass
562 	 * eliminate nops
563 	 * free aux structures
564 	 */
565 	for(p = firstr->prog; p != P; p = p->link){
566 		while(p->link && p->link->as == ANOP)
567 			p->link = p->link->link;
568 	}
569 	if(r1 != R) {
570 		r1->link = freer;
571 		freer = firstr;
572 	}
573 }
574 
575 /*
576  * add mov b,rn
577  * just after r
578  */
579 void
580 addmove(Reg *r, int bn, int rn, int f)
581 {
582 	Prog *p, *p1;
583 	Adr *a;
584 	Var *v;
585 
586 	p1 = alloc(sizeof(*p1));
587 	*p1 = zprog;
588 	p = r->prog;
589 
590 	p1->link = p->link;
591 	p->link = p1;
592 	p1->lineno = p->lineno;
593 
594 	v = var + bn;
595 
596 	a = &p1->to;
597 	a->sym = v->sym;
598 	a->offset = v->offset;
599 	a->etype = v->etype;
600 	a->type = v->name;
601 
602 	p1->as = AMOVL;
603 	if(v->etype == TCHAR || v->etype == TUCHAR)
604 		p1->as = AMOVB;
605 	if(v->etype == TSHORT || v->etype == TUSHORT)
606 		p1->as = AMOVW;
607 
608 	p1->from.type = rn;
609 	if(!f) {
610 		p1->from = *a;
611 		*a = zprog.from;
612 		a->type = rn;
613 		if(v->etype == TUCHAR)
614 			p1->as = AMOVB;
615 		if(v->etype == TUSHORT)
616 			p1->as = AMOVW;
617 	}
618 	if(debug['R'])
619 		print("%P\t.a%P\n", p, p1);
620 }
621 
622 ulong
623 doregbits(int r)
624 {
625 	ulong b;
626 
627 	b = 0;
628 	if(r >= D_INDIR)
629 		r -= D_INDIR;
630 	if(r >= D_AX && r <= D_DI)
631 		b |= RtoB(r);
632 	else
633 	if(r >= D_AL && r <= D_BL)
634 		b |= RtoB(r-D_AL+D_AX);
635 	else
636 	if(r >= D_AH && r <= D_BH)
637 		b |= RtoB(r-D_AH+D_AX);
638 	return b;
639 }
640 
641 Bits
642 mkvar(Reg *r, Adr *a, int isro)
643 {
644 	Var *v;
645 	int i, t, n, et, z;
646 	long o;
647 	Bits bit;
648 	Sym *s;
649 
650 	/*
651 	 * mark registers used
652 	 */
653 	t = a->type;
654 	r->regu |= doregbits(t);
655 	r->regu |= doregbits(a->index);
656 	et = a->etype;
657 
658 	switch(t) {
659 	default:
660 		goto none;
661 	case D_INDIR+D_GS:
662 		if(!isro || 1)
663 			goto none;
664 		n = t;
665 		{static Sym er; a->sym = &er;}
666 		a->sym->name = "$extreg";
667 		break;
668 	case D_ADDR:
669 		a->type = a->index;
670 		bit = mkvar(r, a, 0);
671 		for(z=0; z<BITS; z++)
672 			addrs.b[z] |= bit.b[z];
673 		a->type = t;
674 		goto none;
675 	case D_EXTERN:
676 	case D_STATIC:
677 	case D_PARAM:
678 	case D_AUTO:
679 		n = t;
680 		break;
681 	}
682 	s = a->sym;
683 	if(s == S)
684 		goto none;
685 	if(s->name[0] == '.')
686 		goto none;
687 	o = a->offset;
688 	v = var;
689 	for(i=0; i<nvar; i++) {
690 		if(s == v->sym)
691 		if(n == v->name)
692 		if(o == v->offset)
693 			goto out;
694 		v++;
695 	}
696 	if(nvar >= NVAR) {
697 		if(debug['w'] > 1 && s)
698 			warn(Z, "variable not optimized: %s", s->name);
699 		goto none;
700 	}
701 	i = nvar;
702 	nvar++;
703 	v = &var[i];
704 	v->sym = s;
705 	v->offset = o;
706 	v->name = n;
707 	v->etype = et;
708 	if(debug['R'])
709 		print("bit=%2d et=%2d %D\n", i, et, a);
710 
711 out:
712 	bit = blsh(i);
713 	if(n == D_EXTERN || n == D_STATIC)
714 		for(z=0; z<BITS; z++)
715 			externs.b[z] |= bit.b[z];
716 	if(n == D_PARAM)
717 		for(z=0; z<BITS; z++)
718 			params.b[z] |= bit.b[z];
719 	if(v->etype != et || !typechlpfd[et])	/* funny punning */
720 		for(z=0; z<BITS; z++)
721 			addrs.b[z] |= bit.b[z];
722 	return bit;
723 
724 none:
725 	return zbits;
726 }
727 
728 void
729 prop(Reg *r, Bits ref, Bits cal)
730 {
731 	Reg *r1, *r2;
732 	int z;
733 
734 	for(r1 = r; r1 != R; r1 = r1->p1) {
735 		for(z=0; z<BITS; z++) {
736 			ref.b[z] |= r1->refahead.b[z];
737 			if(ref.b[z] != r1->refahead.b[z]) {
738 				r1->refahead.b[z] = ref.b[z];
739 				change++;
740 			}
741 			cal.b[z] |= r1->calahead.b[z];
742 			if(cal.b[z] != r1->calahead.b[z]) {
743 				r1->calahead.b[z] = cal.b[z];
744 				change++;
745 			}
746 		}
747 		switch(r1->prog->as) {
748 		case ACALL:
749 			for(z=0; z<BITS; z++) {
750 				cal.b[z] |= ref.b[z] | externs.b[z];
751 				ref.b[z] = 0;
752 			}
753 			break;
754 
755 		case ATEXT:
756 			for(z=0; z<BITS; z++) {
757 				cal.b[z] = 0;
758 				ref.b[z] = 0;
759 			}
760 			break;
761 
762 		case ARET:
763 			for(z=0; z<BITS; z++) {
764 				cal.b[z] = externs.b[z];
765 				ref.b[z] = 0;
766 			}
767 		}
768 		for(z=0; z<BITS; z++) {
769 			ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
770 				r1->use1.b[z] | r1->use2.b[z];
771 			cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
772 			r1->refbehind.b[z] = ref.b[z];
773 			r1->calbehind.b[z] = cal.b[z];
774 		}
775 		if(r1->active)
776 			break;
777 		r1->active = 1;
778 	}
779 	for(; r != r1; r = r->p1)
780 		for(r2 = r->p2; r2 != R; r2 = r2->p2link)
781 			prop(r2, r->refbehind, r->calbehind);
782 }
783 
784 /*
785  * find looping structure
786  *
787  * 1) find reverse postordering
788  * 2) find approximate dominators,
789  *	the actual dominators if the flow graph is reducible
790  *	otherwise, dominators plus some other non-dominators.
791  *	See Matthew S. Hecht and Jeffrey D. Ullman,
792  *	"Analysis of a Simple Algorithm for Global Data Flow Problems",
793  *	Conf.  Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
794  *	Oct. 1-3, 1973, pp.  207-217.
795  * 3) find all nodes with a predecessor dominated by the current node.
796  *	such a node is a loop head.
797  *	recursively, all preds with a greater rpo number are in the loop
798  */
799 long
800 postorder(Reg *r, Reg **rpo2r, long n)
801 {
802 	Reg *r1;
803 
804 	r->rpo = 1;
805 	r1 = r->s1;
806 	if(r1 && !r1->rpo)
807 		n = postorder(r1, rpo2r, n);
808 	r1 = r->s2;
809 	if(r1 && !r1->rpo)
810 		n = postorder(r1, rpo2r, n);
811 	rpo2r[n] = r;
812 	n++;
813 	return n;
814 }
815 
816 long
817 rpolca(long *idom, long rpo1, long rpo2)
818 {
819 	long t;
820 
821 	if(rpo1 == -1)
822 		return rpo2;
823 	while(rpo1 != rpo2){
824 		if(rpo1 > rpo2){
825 			t = rpo2;
826 			rpo2 = rpo1;
827 			rpo1 = t;
828 		}
829 		while(rpo1 < rpo2){
830 			t = idom[rpo2];
831 			if(t >= rpo2)
832 				fatal(Z, "bad idom");
833 			rpo2 = t;
834 		}
835 	}
836 	return rpo1;
837 }
838 
839 int
840 doms(long *idom, long r, long s)
841 {
842 	while(s > r)
843 		s = idom[s];
844 	return s == r;
845 }
846 
847 int
848 loophead(long *idom, Reg *r)
849 {
850 	long src;
851 
852 	src = r->rpo;
853 	if(r->p1 != R && doms(idom, src, r->p1->rpo))
854 		return 1;
855 	for(r = r->p2; r != R; r = r->p2link)
856 		if(doms(idom, src, r->rpo))
857 			return 1;
858 	return 0;
859 }
860 
861 void
862 loopmark(Reg **rpo2r, long head, Reg *r)
863 {
864 	if(r->rpo < head || r->active == head)
865 		return;
866 	r->active = head;
867 	r->loop += LOOP;
868 	if(r->p1 != R)
869 		loopmark(rpo2r, head, r->p1);
870 	for(r = r->p2; r != R; r = r->p2link)
871 		loopmark(rpo2r, head, r);
872 }
873 
874 void
875 loopit(Reg *r, long nr)
876 {
877 	Reg *r1;
878 	long i, d, me;
879 
880 	if(nr > maxnr) {
881 		rpo2r = alloc(nr * sizeof(Reg*));
882 		idom = alloc(nr * sizeof(long));
883 		maxnr = nr;
884 	}
885 
886 	d = postorder(r, rpo2r, 0);
887 	if(d > nr)
888 		fatal(Z, "too many reg nodes");
889 	nr = d;
890 	for(i = 0; i < nr / 2; i++){
891 		r1 = rpo2r[i];
892 		rpo2r[i] = rpo2r[nr - 1 - i];
893 		rpo2r[nr - 1 - i] = r1;
894 	}
895 	for(i = 0; i < nr; i++)
896 		rpo2r[i]->rpo = i;
897 
898 	idom[0] = 0;
899 	for(i = 0; i < nr; i++){
900 		r1 = rpo2r[i];
901 		me = r1->rpo;
902 		d = -1;
903 		if(r1->p1 != R && r1->p1->rpo < me)
904 			d = r1->p1->rpo;
905 		for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
906 			if(r1->rpo < me)
907 				d = rpolca(idom, d, r1->rpo);
908 		idom[i] = d;
909 	}
910 
911 	for(i = 0; i < nr; i++){
912 		r1 = rpo2r[i];
913 		r1->loop++;
914 		if(r1->p2 != R && loophead(idom, r1))
915 			loopmark(rpo2r, i, r1);
916 	}
917 }
918 
919 void
920 synch(Reg *r, Bits dif)
921 {
922 	Reg *r1;
923 	int z;
924 
925 	for(r1 = r; r1 != R; r1 = r1->s1) {
926 		for(z=0; z<BITS; z++) {
927 			dif.b[z] = (dif.b[z] &
928 				~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
929 					r1->set.b[z] | r1->regdiff.b[z];
930 			if(dif.b[z] != r1->regdiff.b[z]) {
931 				r1->regdiff.b[z] = dif.b[z];
932 				change++;
933 			}
934 		}
935 		if(r1->active)
936 			break;
937 		r1->active = 1;
938 		for(z=0; z<BITS; z++)
939 			dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
940 		if(r1->s2 != R)
941 			synch(r1->s2, dif);
942 	}
943 }
944 
945 ulong
946 allreg(ulong b, Rgn *r)
947 {
948 	Var *v;
949 	int i;
950 
951 	v = var + r->varno;
952 	r->regno = 0;
953 	switch(v->etype) {
954 
955 	default:
956 		diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
957 		break;
958 
959 	case TCHAR:
960 	case TUCHAR:
961 	case TSHORT:
962 	case TUSHORT:
963 	case TINT:
964 	case TUINT:
965 	case TLONG:
966 	case TULONG:
967 	case TIND:
968 	case TARRAY:
969 		i = BtoR(~b);
970 		if(i && r->cost > 0) {
971 			r->regno = i;
972 			return RtoB(i);
973 		}
974 		break;
975 
976 	case TDOUBLE:
977 	case TFLOAT:
978 		break;
979 	}
980 	return 0;
981 }
982 
983 void
984 paint1(Reg *r, int bn)
985 {
986 	Reg *r1;
987 	Prog *p;
988 	int z;
989 	ulong bb;
990 
991 	z = bn/32;
992 	bb = 1L<<(bn%32);
993 	if(r->act.b[z] & bb)
994 		return;
995 	for(;;) {
996 		if(!(r->refbehind.b[z] & bb))
997 			break;
998 		r1 = r->p1;
999 		if(r1 == R)
1000 			break;
1001 		if(!(r1->refahead.b[z] & bb))
1002 			break;
1003 		if(r1->act.b[z] & bb)
1004 			break;
1005 		r = r1;
1006 	}
1007 
1008 	if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1009 		change -= CLOAD * r->loop;
1010 		if(debug['R'] && debug['v'])
1011 			print("%ld%P\tld %B $%d\n", r->loop,
1012 				r->prog, blsh(bn), change);
1013 	}
1014 	for(;;) {
1015 		r->act.b[z] |= bb;
1016 		p = r->prog;
1017 
1018 		if(r->use1.b[z] & bb) {
1019 			change += CREF * r->loop;
1020 			if(p->as == AFMOVL || p->as == AFMOVW)
1021 				if(BtoR(bb) != D_F0)
1022 					change = -CINF;
1023 			if(debug['R'] && debug['v'])
1024 				print("%ld%P\tu1 %B $%d\n", r->loop,
1025 					p, blsh(bn), change);
1026 		}
1027 
1028 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1029 			change += CREF * r->loop;
1030 			if(p->as == AFMOVL || p->as == AFMOVW)
1031 				if(BtoR(bb) != D_F0)
1032 					change = -CINF;
1033 			if(debug['R'] && debug['v'])
1034 				print("%ld%P\tu2 %B $%d\n", r->loop,
1035 					p, blsh(bn), change);
1036 		}
1037 
1038 		if(STORE(r) & r->regdiff.b[z] & bb) {
1039 			change -= CLOAD * r->loop;
1040 			if(p->as == AFMOVL || p->as == AFMOVW)
1041 				if(BtoR(bb) != D_F0)
1042 					change = -CINF;
1043 			if(debug['R'] && debug['v'])
1044 				print("%ld%P\tst %B $%d\n", r->loop,
1045 					p, blsh(bn), change);
1046 		}
1047 
1048 		if(r->refbehind.b[z] & bb)
1049 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1050 				if(r1->refahead.b[z] & bb)
1051 					paint1(r1, bn);
1052 
1053 		if(!(r->refahead.b[z] & bb))
1054 			break;
1055 		r1 = r->s2;
1056 		if(r1 != R)
1057 			if(r1->refbehind.b[z] & bb)
1058 				paint1(r1, bn);
1059 		r = r->s1;
1060 		if(r == R)
1061 			break;
1062 		if(r->act.b[z] & bb)
1063 			break;
1064 		if(!(r->refbehind.b[z] & bb))
1065 			break;
1066 	}
1067 }
1068 
1069 ulong
1070 regset(Reg *r, ulong bb)
1071 {
1072 	ulong b, set;
1073 	Adr v;
1074 	int c;
1075 
1076 	set = 0;
1077 	v = zprog.from;
1078 	while(b = bb & ~(bb-1)) {
1079 		v.type = BtoR(b);
1080 		c = copyu(r->prog, &v, A);
1081 		if(c == 3)
1082 			set |= b;
1083 		bb &= ~b;
1084 	}
1085 	return set;
1086 }
1087 
1088 ulong
1089 reguse(Reg *r, ulong bb)
1090 {
1091 	ulong b, set;
1092 	Adr v;
1093 	int c;
1094 
1095 	set = 0;
1096 	v = zprog.from;
1097 	while(b = bb & ~(bb-1)) {
1098 		v.type = BtoR(b);
1099 		c = copyu(r->prog, &v, A);
1100 		if(c == 1 || c == 2 || c == 4)
1101 			set |= b;
1102 		bb &= ~b;
1103 	}
1104 	return set;
1105 }
1106 
1107 ulong
1108 paint2(Reg *r, int bn)
1109 {
1110 	Reg *r1;
1111 	int z;
1112 	ulong bb, vreg, x;
1113 
1114 	z = bn/32;
1115 	bb = 1L << (bn%32);
1116 	vreg = regbits;
1117 	if(!(r->act.b[z] & bb))
1118 		return vreg;
1119 	for(;;) {
1120 		if(!(r->refbehind.b[z] & bb))
1121 			break;
1122 		r1 = r->p1;
1123 		if(r1 == R)
1124 			break;
1125 		if(!(r1->refahead.b[z] & bb))
1126 			break;
1127 		if(!(r1->act.b[z] & bb))
1128 			break;
1129 		r = r1;
1130 	}
1131 	for(;;) {
1132 		r->act.b[z] &= ~bb;
1133 
1134 		vreg |= r->regu;
1135 
1136 		if(r->refbehind.b[z] & bb)
1137 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1138 				if(r1->refahead.b[z] & bb)
1139 					vreg |= paint2(r1, bn);
1140 
1141 		if(!(r->refahead.b[z] & bb))
1142 			break;
1143 		r1 = r->s2;
1144 		if(r1 != R)
1145 			if(r1->refbehind.b[z] & bb)
1146 				vreg |= paint2(r1, bn);
1147 		r = r->s1;
1148 		if(r == R)
1149 			break;
1150 		if(!(r->act.b[z] & bb))
1151 			break;
1152 		if(!(r->refbehind.b[z] & bb))
1153 			break;
1154 	}
1155 
1156 	bb = vreg;
1157 	for(; r; r=r->s1) {
1158 		x = r->regu & ~bb;
1159 		if(x) {
1160 			vreg |= reguse(r, x);
1161 			bb |= regset(r, x);
1162 		}
1163 	}
1164 	return vreg;
1165 }
1166 
1167 void
1168 paint3(Reg *r, int bn, long rb, int rn)
1169 {
1170 	Reg *r1;
1171 	Prog *p;
1172 	int z;
1173 	ulong bb;
1174 
1175 	z = bn/32;
1176 	bb = 1L << (bn%32);
1177 	if(r->act.b[z] & bb)
1178 		return;
1179 	for(;;) {
1180 		if(!(r->refbehind.b[z] & bb))
1181 			break;
1182 		r1 = r->p1;
1183 		if(r1 == R)
1184 			break;
1185 		if(!(r1->refahead.b[z] & bb))
1186 			break;
1187 		if(r1->act.b[z] & bb)
1188 			break;
1189 		r = r1;
1190 	}
1191 
1192 	if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1193 		addmove(r, bn, rn, 0);
1194 	for(;;) {
1195 		r->act.b[z] |= bb;
1196 		p = r->prog;
1197 
1198 		if(r->use1.b[z] & bb) {
1199 			if(debug['R'])
1200 				print("%P", p);
1201 			addreg(&p->from, rn);
1202 			if(debug['R'])
1203 				print("\t.c%P\n", p);
1204 		}
1205 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1206 			if(debug['R'])
1207 				print("%P", p);
1208 			addreg(&p->to, rn);
1209 			if(debug['R'])
1210 				print("\t.c%P\n", p);
1211 		}
1212 
1213 		if(STORE(r) & r->regdiff.b[z] & bb)
1214 			addmove(r, bn, rn, 1);
1215 		r->regu |= rb;
1216 
1217 		if(r->refbehind.b[z] & bb)
1218 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1219 				if(r1->refahead.b[z] & bb)
1220 					paint3(r1, bn, rb, rn);
1221 
1222 		if(!(r->refahead.b[z] & bb))
1223 			break;
1224 		r1 = r->s2;
1225 		if(r1 != R)
1226 			if(r1->refbehind.b[z] & bb)
1227 				paint3(r1, bn, rb, rn);
1228 		r = r->s1;
1229 		if(r == R)
1230 			break;
1231 		if(r->act.b[z] & bb)
1232 			break;
1233 		if(!(r->refbehind.b[z] & bb))
1234 			break;
1235 	}
1236 }
1237 
1238 void
1239 addreg(Adr *a, int rn)
1240 {
1241 
1242 	a->sym = 0;
1243 	a->offset = 0;
1244 	a->type = rn;
1245 }
1246 
1247 long
1248 RtoB(int r)
1249 {
1250 
1251 	if(r < D_AX || r > D_DI)
1252 		return 0;
1253 	return 1L << (r-D_AX);
1254 }
1255 
1256 int
1257 BtoR(long b)
1258 {
1259 
1260 	b &= 0xffL;
1261 	if(b == 0)
1262 		return 0;
1263 	return bitno(b) + D_AX;
1264 }
1265