xref: /inferno-os/utils/6c/reg.c (revision 50b0dbb170df61467e42c7ea4deb0b5692d15f4c)
1 #include "gc.h"
2 
3 Reg*
4 rega(void)
5 {
6 	Reg *r;
7 
8 	r = freer;
9 	if(r == R) {
10 		r = alloc(sizeof(*r));
11 	} else
12 		freer = r->link;
13 
14 	*r = zreg;
15 	return r;
16 }
17 
18 int
19 rcmp(void *a1, void *a2)
20 {
21 	Rgn *p1, *p2;
22 	int c1, c2;
23 
24 	p1 = (Rgn*)a1;
25 	p2 = (Rgn*)a2;
26 	c1 = p2->cost;
27 	c2 = p1->cost;
28 	if(c1 -= c2)
29 		return c1;
30 	return p2->varno - p1->varno;
31 }
32 
33 void
34 regopt(Prog *p)
35 {
36 	Reg *r, *r1, *r2;
37 	Prog *p1;
38 	int i, z;
39 	long initpc, val, npc;
40 	ulong vreg;
41 	Bits bit;
42 	struct
43 	{
44 		long	m;
45 		long	c;
46 		Reg*	p;
47 	} log5[6], *lp;
48 
49 	firstr = R;
50 	lastr = R;
51 	nvar = 0;
52 	regbits = RtoB(D_SP) | RtoB(D_AX) | RtoB(D_X0);
53 	if(REGEXT)
54 		regbits |= RtoB(REGEXT) | RtoB(REGEXT-1);
55 	for(z=0; z<BITS; z++) {
56 		externs.b[z] = 0;
57 		params.b[z] = 0;
58 		consts.b[z] = 0;
59 		addrs.b[z] = 0;
60 	}
61 
62 	/*
63 	 * pass 1
64 	 * build aux data structure
65 	 * allocate pcs
66 	 * find use and set of variables
67 	 */
68 	val = 5L * 5L * 5L * 5L * 5L;
69 	lp = log5;
70 	for(i=0; i<5; i++) {
71 		lp->m = val;
72 		lp->c = 0;
73 		lp->p = R;
74 		val /= 5L;
75 		lp++;
76 	}
77 	val = 0;
78 	for(; p != P; p = p->link) {
79 		switch(p->as) {
80 		case ADATA:
81 		case AGLOBL:
82 		case ANAME:
83 		case ASIGNAME:
84 			continue;
85 		}
86 		r = rega();
87 		if(firstr == R) {
88 			firstr = r;
89 			lastr = r;
90 		} else {
91 			lastr->link = r;
92 			r->p1 = lastr;
93 			lastr->s1 = r;
94 			lastr = r;
95 		}
96 		r->prog = p;
97 		r->pc = val;
98 		val++;
99 
100 		lp = log5;
101 		for(i=0; i<5; i++) {
102 			lp->c--;
103 			if(lp->c <= 0) {
104 				lp->c = lp->m;
105 				if(lp->p != R)
106 					lp->p->log5 = r;
107 				lp->p = r;
108 				(lp+1)->c = 0;
109 				break;
110 			}
111 			lp++;
112 		}
113 
114 		r1 = r->p1;
115 		if(r1 != R)
116 		switch(r1->prog->as) {
117 		case ARET:
118 		case AJMP:
119 		case AIRETL:
120 		case AIRETQ:
121 			r->p1 = R;
122 			r1->s1 = R;
123 		}
124 
125 		bit = mkvar(r, &p->from);
126 		if(bany(&bit))
127 		switch(p->as) {
128 		/*
129 		 * funny
130 		 */
131 		case ALEAL:
132 		case ALEAQ:
133 			for(z=0; z<BITS; z++)
134 				addrs.b[z] |= bit.b[z];
135 			break;
136 
137 		/*
138 		 * left side read
139 		 */
140 		default:
141 			for(z=0; z<BITS; z++)
142 				r->use1.b[z] |= bit.b[z];
143 			break;
144 		}
145 
146 		bit = mkvar(r, &p->to);
147 		if(bany(&bit))
148 		switch(p->as) {
149 		default:
150 			diag(Z, "reg: unknown op: %A", p->as);
151 			break;
152 
153 		/*
154 		 * right side read
155 		 */
156 		case ACMPB:
157 		case ACMPL:
158 		case ACMPQ:
159 		case ACMPW:
160 		case ACOMISS:
161 		case ACOMISD:
162 		case AUCOMISS:
163 		case AUCOMISD:
164 			for(z=0; z<BITS; z++)
165 				r->use2.b[z] |= bit.b[z];
166 			break;
167 
168 		/*
169 		 * right side write
170 		 */
171 		case ANOP:
172 		case AMOVL:
173 		case AMOVQ:
174 		case AMOVB:
175 		case AMOVW:
176 		case AMOVBLSX:
177 		case AMOVBLZX:
178 		case AMOVBQSX:
179 		case AMOVBQZX:
180 		case AMOVLQSX:
181 		case AMOVLQZX:
182 		case AMOVWLSX:
183 		case AMOVWLZX:
184 		case AMOVWQSX:
185 		case AMOVWQZX:
186 
187 		case AMOVSS:
188 		case AMOVSD:
189 		case ACVTSD2SL:
190 		case ACVTSD2SQ:
191 		case ACVTSD2SS:
192 		case ACVTSL2SD:
193 		case ACVTSL2SS:
194 		case ACVTSQ2SD:
195 		case ACVTSQ2SS:
196 		case ACVTSS2SD:
197 		case ACVTSS2SL:
198 		case ACVTSS2SQ:
199 		case ACVTTSD2SL:
200 		case ACVTTSD2SQ:
201 		case ACVTTSS2SL:
202 		case ACVTTSS2SQ:
203 			for(z=0; z<BITS; z++)
204 				r->set.b[z] |= bit.b[z];
205 			break;
206 
207 		/*
208 		 * right side read+write
209 		 */
210 		case AADDB:
211 		case AADDL:
212 		case AADDQ:
213 		case AADDW:
214 		case AANDB:
215 		case AANDL:
216 		case AANDQ:
217 		case AANDW:
218 		case ASUBB:
219 		case ASUBL:
220 		case ASUBQ:
221 		case ASUBW:
222 		case AORB:
223 		case AORL:
224 		case AORQ:
225 		case AORW:
226 		case AXORB:
227 		case AXORL:
228 		case AXORQ:
229 		case AXORW:
230 		case ASALB:
231 		case ASALL:
232 		case ASALQ:
233 		case ASALW:
234 		case ASARB:
235 		case ASARL:
236 		case ASARQ:
237 		case ASARW:
238 		case AROLB:
239 		case AROLL:
240 		case AROLQ:
241 		case AROLW:
242 		case ARORB:
243 		case ARORL:
244 		case ARORQ:
245 		case ARORW:
246 		case ASHLB:
247 		case ASHLL:
248 		case ASHLQ:
249 		case ASHLW:
250 		case ASHRB:
251 		case ASHRL:
252 		case ASHRQ:
253 		case ASHRW:
254 		case AIMULL:
255 		case AIMULQ:
256 		case AIMULW:
257 		case ANEGL:
258 		case ANEGQ:
259 		case ANOTL:
260 		case ANOTQ:
261 		case AADCL:
262 		case AADCQ:
263 		case ASBBL:
264 		case ASBBQ:
265 
266 		case AADDSD:
267 		case AADDSS:
268 		case ACMPSD:
269 		case ACMPSS:
270 		case ADIVSD:
271 		case ADIVSS:
272 		case AMAXSD:
273 		case AMAXSS:
274 		case AMINSD:
275 		case AMINSS:
276 		case AMULSD:
277 		case AMULSS:
278 		case ARCPSS:
279 		case ARSQRTSS:
280 		case ASQRTSD:
281 		case ASQRTSS:
282 		case ASUBSD:
283 		case ASUBSS:
284 		case AXORPD:
285 			for(z=0; z<BITS; z++) {
286 				r->set.b[z] |= bit.b[z];
287 				r->use2.b[z] |= bit.b[z];
288 			}
289 			break;
290 
291 		/*
292 		 * funny
293 		 */
294 		case ACALL:
295 			for(z=0; z<BITS; z++)
296 				addrs.b[z] |= bit.b[z];
297 			break;
298 		}
299 
300 		switch(p->as) {
301 		case AIMULL:
302 		case AIMULQ:
303 		case AIMULW:
304 			if(p->to.type != D_NONE)
305 				break;
306 
307 		case AIDIVB:
308 		case AIDIVL:
309 		case AIDIVQ:
310 		case AIDIVW:
311 		case AIMULB:
312 		case ADIVB:
313 		case ADIVL:
314 		case ADIVQ:
315 		case ADIVW:
316 		case AMULB:
317 		case AMULL:
318 		case AMULQ:
319 		case AMULW:
320 
321 		case ACWD:
322 		case ACDQ:
323 		case ACQO:
324 			r->regu |= RtoB(D_AX) | RtoB(D_DX);
325 			break;
326 
327 		case AREP:
328 		case AREPN:
329 		case ALOOP:
330 		case ALOOPEQ:
331 		case ALOOPNE:
332 			r->regu |= RtoB(D_CX);
333 			break;
334 
335 		case AMOVSB:
336 		case AMOVSL:
337 		case AMOVSQ:
338 		case AMOVSW:
339 		case ACMPSB:
340 		case ACMPSL:
341 		case ACMPSQ:
342 		case ACMPSW:
343 			r->regu |= RtoB(D_SI) | RtoB(D_DI);
344 			break;
345 
346 		case ASTOSB:
347 		case ASTOSL:
348 		case ASTOSQ:
349 		case ASTOSW:
350 		case ASCASB:
351 		case ASCASL:
352 		case ASCASQ:
353 		case ASCASW:
354 			r->regu |= RtoB(D_AX) | RtoB(D_DI);
355 			break;
356 
357 		case AINSB:
358 		case AINSL:
359 		case AINSW:
360 		case AOUTSB:
361 		case AOUTSL:
362 		case AOUTSW:
363 			r->regu |= RtoB(D_DI) | RtoB(D_DX);
364 			break;
365 		}
366 	}
367 	if(firstr == R)
368 		return;
369 	initpc = pc - val;
370 	npc = val;
371 
372 	/*
373 	 * pass 2
374 	 * turn branch references to pointers
375 	 * build back pointers
376 	 */
377 	for(r = firstr; r != R; r = r->link) {
378 		p = r->prog;
379 		if(p->to.type == D_BRANCH) {
380 			val = p->to.offset - initpc;
381 			r1 = firstr;
382 			while(r1 != R) {
383 				r2 = r1->log5;
384 				if(r2 != R && val >= r2->pc) {
385 					r1 = r2;
386 					continue;
387 				}
388 				if(r1->pc == val)
389 					break;
390 				r1 = r1->link;
391 			}
392 			if(r1 == R) {
393 				nearln = p->lineno;
394 				diag(Z, "ref not found\n%P", p);
395 				continue;
396 			}
397 			if(r1 == r) {
398 				nearln = p->lineno;
399 				diag(Z, "ref to self\n%P", p);
400 				continue;
401 			}
402 			r->s2 = r1;
403 			r->p2link = r1->p2;
404 			r1->p2 = r;
405 		}
406 	}
407 	if(debug['R']) {
408 		p = firstr->prog;
409 		print("\n%L %D\n", p->lineno, &p->from);
410 	}
411 
412 	/*
413 	 * pass 2.5
414 	 * find looping structure
415 	 */
416 	for(r = firstr; r != R; r = r->link)
417 		r->active = 0;
418 	change = 0;
419 	loopit(firstr, npc);
420 	if(debug['R'] && debug['v']) {
421 		print("\nlooping structure:\n");
422 		for(r = firstr; r != R; r = r->link) {
423 			print("%ld:%P", r->loop, r->prog);
424 			for(z=0; z<BITS; z++)
425 				bit.b[z] = r->use1.b[z] |
426 					   r->use2.b[z] |
427 					   r->set.b[z];
428 			if(bany(&bit)) {
429 				print("\t");
430 				if(bany(&r->use1))
431 					print(" u1=%B", r->use1);
432 				if(bany(&r->use2))
433 					print(" u2=%B", r->use2);
434 				if(bany(&r->set))
435 					print(" st=%B", r->set);
436 			}
437 			print("\n");
438 		}
439 	}
440 
441 	/*
442 	 * pass 3
443 	 * iterate propagating usage
444 	 * 	back until flow graph is complete
445 	 */
446 loop1:
447 	change = 0;
448 	for(r = firstr; r != R; r = r->link)
449 		r->active = 0;
450 	for(r = firstr; r != R; r = r->link)
451 		if(r->prog->as == ARET)
452 			prop(r, zbits, zbits);
453 loop11:
454 	/* pick up unreachable code */
455 	i = 0;
456 	for(r = firstr; r != R; r = r1) {
457 		r1 = r->link;
458 		if(r1 && r1->active && !r->active) {
459 			prop(r, zbits, zbits);
460 			i = 1;
461 		}
462 	}
463 	if(i)
464 		goto loop11;
465 	if(change)
466 		goto loop1;
467 
468 
469 	/*
470 	 * pass 4
471 	 * iterate propagating register/variable synchrony
472 	 * 	forward until graph is complete
473 	 */
474 loop2:
475 	change = 0;
476 	for(r = firstr; r != R; r = r->link)
477 		r->active = 0;
478 	synch(firstr, zbits);
479 	if(change)
480 		goto loop2;
481 
482 
483 	/*
484 	 * pass 5
485 	 * isolate regions
486 	 * calculate costs (paint1)
487 	 */
488 	r = firstr;
489 	if(r) {
490 		for(z=0; z<BITS; z++)
491 			bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
492 			  ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
493 		if(bany(&bit)) {
494 			nearln = r->prog->lineno;
495 			warn(Z, "used and not set: %B", bit);
496 			if(debug['R'] && !debug['w'])
497 				print("used and not set: %B\n", bit);
498 		}
499 	}
500 	if(debug['R'] && debug['v'])
501 		print("\nprop structure:\n");
502 	for(r = firstr; r != R; r = r->link)
503 		r->act = zbits;
504 	rgp = region;
505 	nregion = 0;
506 	for(r = firstr; r != R; r = r->link) {
507 		if(debug['R'] && debug['v']) {
508 			print("%P\t", r->prog);
509 			if(bany(&r->set))
510 				print("s:%B ", r->set);
511 			if(bany(&r->refahead))
512 				print("ra:%B ", r->refahead);
513 			if(bany(&r->calahead))
514 				print("ca:%B ", r->calahead);
515 			print("\n");
516 		}
517 		for(z=0; z<BITS; z++)
518 			bit.b[z] = r->set.b[z] &
519 			  ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
520 		if(bany(&bit)) {
521 			nearln = r->prog->lineno;
522 			warn(Z, "set and not used: %B", bit);
523 			if(debug['R'])
524 				print("set and not used: %B\n", bit);
525 			excise(r);
526 		}
527 		for(z=0; z<BITS; z++)
528 			bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
529 		while(bany(&bit)) {
530 			i = bnum(bit);
531 			rgp->enter = r;
532 			rgp->varno = i;
533 			change = 0;
534 			if(debug['R'] && debug['v'])
535 				print("\n");
536 			paint1(r, i);
537 			bit.b[i/32] &= ~(1L<<(i%32));
538 			if(change <= 0) {
539 				if(debug['R'])
540 					print("%L$%d: %B\n",
541 						r->prog->lineno, change, blsh(i));
542 				continue;
543 			}
544 			rgp->cost = change;
545 			nregion++;
546 			if(nregion >= NRGN) {
547 				warn(Z, "too many regions");
548 				goto brk;
549 			}
550 			rgp++;
551 		}
552 	}
553 brk:
554 	qsort(region, nregion, sizeof(region[0]), rcmp);
555 
556 	/*
557 	 * pass 6
558 	 * determine used registers (paint2)
559 	 * replace code (paint3)
560 	 */
561 	rgp = region;
562 	for(i=0; i<nregion; i++) {
563 		bit = blsh(rgp->varno);
564 		vreg = paint2(rgp->enter, rgp->varno);
565 		vreg = allreg(vreg, rgp);
566 		if(debug['R']) {
567 			print("%L$%d %R: %B\n",
568 				rgp->enter->prog->lineno,
569 				rgp->cost,
570 				rgp->regno,
571 				bit);
572 		}
573 		if(rgp->regno != 0)
574 			paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
575 		rgp++;
576 	}
577 	/*
578 	 * pass 7
579 	 * peep-hole on basic block
580 	 */
581 	if(!debug['R'] || debug['P'])
582 		peep();
583 
584 	/*
585 	 * pass 8
586 	 * recalculate pc
587 	 */
588 	val = initpc;
589 	for(r = firstr; r != R; r = r1) {
590 		r->pc = val;
591 		p = r->prog;
592 		p1 = P;
593 		r1 = r->link;
594 		if(r1 != R)
595 			p1 = r1->prog;
596 		for(; p != p1; p = p->link) {
597 			switch(p->as) {
598 			default:
599 				val++;
600 				break;
601 
602 			case ANOP:
603 			case ADATA:
604 			case AGLOBL:
605 			case ANAME:
606 			case ASIGNAME:
607 				break;
608 			}
609 		}
610 	}
611 	pc = val;
612 
613 	/*
614 	 * fix up branches
615 	 */
616 	if(debug['R'])
617 		if(bany(&addrs))
618 			print("addrs: %B\n", addrs);
619 
620 	r1 = 0; /* set */
621 	for(r = firstr; r != R; r = r->link) {
622 		p = r->prog;
623 		if(p->to.type == D_BRANCH)
624 			p->to.offset = r->s2->pc;
625 		r1 = r;
626 	}
627 
628 	/*
629 	 * last pass
630 	 * eliminate nops
631 	 * free aux structures
632 	 */
633 	for(p = firstr->prog; p != P; p = p->link){
634 		while(p->link && p->link->as == ANOP)
635 			p->link = p->link->link;
636 	}
637 	if(r1 != R) {
638 		r1->link = freer;
639 		freer = firstr;
640 	}
641 }
642 
643 /*
644  * add mov b,rn
645  * just after r
646  */
647 void
648 addmove(Reg *r, int bn, int rn, int f)
649 {
650 	Prog *p, *p1;
651 	Adr *a;
652 	Var *v;
653 
654 	p1 = alloc(sizeof(*p1));
655 	*p1 = zprog;
656 	p = r->prog;
657 
658 	p1->link = p->link;
659 	p->link = p1;
660 	p1->lineno = p->lineno;
661 
662 	v = var + bn;
663 
664 	a = &p1->to;
665 	a->sym = v->sym;
666 	a->offset = v->offset;
667 	a->etype = v->etype;
668 	a->type = v->name;
669 
670 	p1->as = AMOVL;
671 	if(v->etype == TCHAR || v->etype == TUCHAR)
672 		p1->as = AMOVB;
673 	if(v->etype == TSHORT || v->etype == TUSHORT)
674 		p1->as = AMOVW;
675 	if(v->etype == TVLONG || v->etype == TUVLONG || v->etype == TIND)
676 		p1->as = AMOVQ;
677 	if(v->etype == TFLOAT)
678 		p1->as = AMOVSS;
679 	if(v->etype == TDOUBLE)
680 		p1->as = AMOVSD;
681 
682 	p1->from.type = rn;
683 	if(!f) {
684 		p1->from = *a;
685 		*a = zprog.from;
686 		a->type = rn;
687 		if(v->etype == TUCHAR)
688 			p1->as = AMOVB;
689 		if(v->etype == TUSHORT)
690 			p1->as = AMOVW;
691 	}
692 	if(debug['R'])
693 		print("%P\t.a%P\n", p, p1);
694 }
695 
696 ulong
697 doregbits(int r)
698 {
699 	ulong b;
700 
701 	b = 0;
702 	if(r >= D_INDIR)
703 		r -= D_INDIR;
704 	if(r >= D_AX && r <= D_R15)
705 		b |= RtoB(r);
706 	else
707 	if(r >= D_AL && r <= D_R15B)
708 		b |= RtoB(r-D_AL+D_AX);
709 	else
710 	if(r >= D_AH && r <= D_BH)
711 		b |= RtoB(r-D_AH+D_AX);
712 	else
713 	if(r >= D_X0 && r <= D_X0+15)
714 		b |= FtoB(r);
715 	return b;
716 }
717 
718 Bits
719 mkvar(Reg *r, Adr *a)
720 {
721 	Var *v;
722 	int i, t, n, et, z;
723 	long o;
724 	Bits bit;
725 	Sym *s;
726 
727 	/*
728 	 * mark registers used
729 	 */
730 	t = a->type;
731 	r->regu |= doregbits(t);
732 	r->regu |= doregbits(a->index);
733 
734 	switch(t) {
735 	default:
736 		goto none;
737 	case D_ADDR:
738 		a->type = a->index;
739 		bit = mkvar(r, a);
740 		for(z=0; z<BITS; z++)
741 			addrs.b[z] |= bit.b[z];
742 		a->type = t;
743 		goto none;
744 	case D_EXTERN:
745 	case D_STATIC:
746 	case D_PARAM:
747 	case D_AUTO:
748 		n = t;
749 		break;
750 	}
751 	s = a->sym;
752 	if(s == S)
753 		goto none;
754 	if(s->name[0] == '.')
755 		goto none;
756 	et = a->etype;
757 	o = a->offset;
758 	v = var;
759 	for(i=0; i<nvar; i++) {
760 		if(s == v->sym)
761 		if(n == v->name)
762 		if(o == v->offset)
763 			goto out;
764 		v++;
765 	}
766 	if(nvar >= NVAR) {
767 		if(debug['w'] > 1 && s)
768 			warn(Z, "variable not optimized: %s", s->name);
769 		goto none;
770 	}
771 	i = nvar;
772 	nvar++;
773 	v = &var[i];
774 	v->sym = s;
775 	v->offset = o;
776 	v->name = n;
777 	v->etype = et;
778 	if(debug['R'])
779 		print("bit=%2d et=%2d %D\n", i, et, a);
780 
781 out:
782 	bit = blsh(i);
783 	if(n == D_EXTERN || n == D_STATIC)
784 		for(z=0; z<BITS; z++)
785 			externs.b[z] |= bit.b[z];
786 	if(n == D_PARAM)
787 		for(z=0; z<BITS; z++)
788 			params.b[z] |= bit.b[z];
789 	if(v->etype != et || !(typechlpfd[et] || typev[et]))	/* funny punning */
790 		for(z=0; z<BITS; z++)
791 			addrs.b[z] |= bit.b[z];
792 	return bit;
793 
794 none:
795 	return zbits;
796 }
797 
798 void
799 prop(Reg *r, Bits ref, Bits cal)
800 {
801 	Reg *r1, *r2;
802 	int z;
803 
804 	for(r1 = r; r1 != R; r1 = r1->p1) {
805 		for(z=0; z<BITS; z++) {
806 			ref.b[z] |= r1->refahead.b[z];
807 			if(ref.b[z] != r1->refahead.b[z]) {
808 				r1->refahead.b[z] = ref.b[z];
809 				change++;
810 			}
811 			cal.b[z] |= r1->calahead.b[z];
812 			if(cal.b[z] != r1->calahead.b[z]) {
813 				r1->calahead.b[z] = cal.b[z];
814 				change++;
815 			}
816 		}
817 		switch(r1->prog->as) {
818 		case ACALL:
819 			for(z=0; z<BITS; z++) {
820 				cal.b[z] |= ref.b[z] | externs.b[z];
821 				ref.b[z] = 0;
822 			}
823 			break;
824 
825 		case ATEXT:
826 			for(z=0; z<BITS; z++) {
827 				cal.b[z] = 0;
828 				ref.b[z] = 0;
829 			}
830 			break;
831 
832 		case ARET:
833 			for(z=0; z<BITS; z++) {
834 				cal.b[z] = externs.b[z];
835 				ref.b[z] = 0;
836 			}
837 		}
838 		for(z=0; z<BITS; z++) {
839 			ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
840 				r1->use1.b[z] | r1->use2.b[z];
841 			cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
842 			r1->refbehind.b[z] = ref.b[z];
843 			r1->calbehind.b[z] = cal.b[z];
844 		}
845 		if(r1->active)
846 			break;
847 		r1->active = 1;
848 	}
849 	for(; r != r1; r = r->p1)
850 		for(r2 = r->p2; r2 != R; r2 = r2->p2link)
851 			prop(r2, r->refbehind, r->calbehind);
852 }
853 
854 /*
855  * find looping structure
856  *
857  * 1) find reverse postordering
858  * 2) find approximate dominators,
859  *	the actual dominators if the flow graph is reducible
860  *	otherwise, dominators plus some other non-dominators.
861  *	See Matthew S. Hecht and Jeffrey D. Ullman,
862  *	"Analysis of a Simple Algorithm for Global Data Flow Problems",
863  *	Conf.  Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
864  *	Oct. 1-3, 1973, pp.  207-217.
865  * 3) find all nodes with a predecessor dominated by the current node.
866  *	such a node is a loop head.
867  *	recursively, all preds with a greater rpo number are in the loop
868  */
869 long
870 postorder(Reg *r, Reg **rpo2r, long n)
871 {
872 	Reg *r1;
873 
874 	r->rpo = 1;
875 	r1 = r->s1;
876 	if(r1 && !r1->rpo)
877 		n = postorder(r1, rpo2r, n);
878 	r1 = r->s2;
879 	if(r1 && !r1->rpo)
880 		n = postorder(r1, rpo2r, n);
881 	rpo2r[n] = r;
882 	n++;
883 	return n;
884 }
885 
886 long
887 rpolca(long *idom, long rpo1, long rpo2)
888 {
889 	long t;
890 
891 	if(rpo1 == -1)
892 		return rpo2;
893 	while(rpo1 != rpo2){
894 		if(rpo1 > rpo2){
895 			t = rpo2;
896 			rpo2 = rpo1;
897 			rpo1 = t;
898 		}
899 		while(rpo1 < rpo2){
900 			t = idom[rpo2];
901 			if(t >= rpo2)
902 				fatal(Z, "bad idom");
903 			rpo2 = t;
904 		}
905 	}
906 	return rpo1;
907 }
908 
909 int
910 doms(long *idom, long r, long s)
911 {
912 	while(s > r)
913 		s = idom[s];
914 	return s == r;
915 }
916 
917 int
918 loophead(long *idom, Reg *r)
919 {
920 	long src;
921 
922 	src = r->rpo;
923 	if(r->p1 != R && doms(idom, src, r->p1->rpo))
924 		return 1;
925 	for(r = r->p2; r != R; r = r->p2link)
926 		if(doms(idom, src, r->rpo))
927 			return 1;
928 	return 0;
929 }
930 
931 void
932 loopmark(Reg **rpo2r, long head, Reg *r)
933 {
934 	if(r->rpo < head || r->active == head)
935 		return;
936 	r->active = head;
937 	r->loop += LOOP;
938 	if(r->p1 != R)
939 		loopmark(rpo2r, head, r->p1);
940 	for(r = r->p2; r != R; r = r->p2link)
941 		loopmark(rpo2r, head, r);
942 }
943 
944 void
945 loopit(Reg *r, long nr)
946 {
947 	Reg *r1;
948 	long i, d, me;
949 
950 	if(nr > maxnr) {
951 		rpo2r = alloc(nr * sizeof(Reg*));
952 		idom = alloc(nr * sizeof(long));
953 		maxnr = nr;
954 	}
955 
956 	d = postorder(r, rpo2r, 0);
957 	if(d > nr)
958 		fatal(Z, "too many reg nodes");
959 	nr = d;
960 	for(i = 0; i < nr / 2; i++){
961 		r1 = rpo2r[i];
962 		rpo2r[i] = rpo2r[nr - 1 - i];
963 		rpo2r[nr - 1 - i] = r1;
964 	}
965 	for(i = 0; i < nr; i++)
966 		rpo2r[i]->rpo = i;
967 
968 	idom[0] = 0;
969 	for(i = 0; i < nr; i++){
970 		r1 = rpo2r[i];
971 		me = r1->rpo;
972 		d = -1;
973 		if(r1->p1 != R && r1->p1->rpo < me)
974 			d = r1->p1->rpo;
975 		for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
976 			if(r1->rpo < me)
977 				d = rpolca(idom, d, r1->rpo);
978 		idom[i] = d;
979 	}
980 
981 	for(i = 0; i < nr; i++){
982 		r1 = rpo2r[i];
983 		r1->loop++;
984 		if(r1->p2 != R && loophead(idom, r1))
985 			loopmark(rpo2r, i, r1);
986 	}
987 }
988 
989 void
990 synch(Reg *r, Bits dif)
991 {
992 	Reg *r1;
993 	int z;
994 
995 	for(r1 = r; r1 != R; r1 = r1->s1) {
996 		for(z=0; z<BITS; z++) {
997 			dif.b[z] = (dif.b[z] &
998 				~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
999 					r1->set.b[z] | r1->regdiff.b[z];
1000 			if(dif.b[z] != r1->regdiff.b[z]) {
1001 				r1->regdiff.b[z] = dif.b[z];
1002 				change++;
1003 			}
1004 		}
1005 		if(r1->active)
1006 			break;
1007 		r1->active = 1;
1008 		for(z=0; z<BITS; z++)
1009 			dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
1010 		if(r1->s2 != R)
1011 			synch(r1->s2, dif);
1012 	}
1013 }
1014 
1015 ulong
1016 allreg(ulong b, Rgn *r)
1017 {
1018 	Var *v;
1019 	int i;
1020 
1021 	v = var + r->varno;
1022 	r->regno = 0;
1023 	switch(v->etype) {
1024 
1025 	default:
1026 		diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
1027 		break;
1028 
1029 	case TCHAR:
1030 	case TUCHAR:
1031 	case TSHORT:
1032 	case TUSHORT:
1033 	case TINT:
1034 	case TUINT:
1035 	case TLONG:
1036 	case TULONG:
1037 	case TVLONG:
1038 	case TUVLONG:
1039 	case TIND:
1040 	case TARRAY:
1041 		i = BtoR(~b);
1042 		if(i && r->cost > 0) {
1043 			r->regno = i;
1044 			return RtoB(i);
1045 		}
1046 		break;
1047 
1048 	case TDOUBLE:
1049 	case TFLOAT:
1050 		i = BtoF(~b);
1051 		if(i && r->cost > 0) {
1052 			r->regno = i;
1053 			return FtoB(i);
1054 		}
1055 		break;
1056 	}
1057 	return 0;
1058 }
1059 
1060 void
1061 paint1(Reg *r, int bn)
1062 {
1063 	Reg *r1;
1064 	Prog *p;
1065 	int z;
1066 	ulong bb;
1067 
1068 	z = bn/32;
1069 	bb = 1L<<(bn%32);
1070 	if(r->act.b[z] & bb)
1071 		return;
1072 	for(;;) {
1073 		if(!(r->refbehind.b[z] & bb))
1074 			break;
1075 		r1 = r->p1;
1076 		if(r1 == R)
1077 			break;
1078 		if(!(r1->refahead.b[z] & bb))
1079 			break;
1080 		if(r1->act.b[z] & bb)
1081 			break;
1082 		r = r1;
1083 	}
1084 
1085 	if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1086 		change -= CLOAD * r->loop;
1087 		if(debug['R'] && debug['v'])
1088 			print("%ld%P\tld %B $%d\n", r->loop,
1089 				r->prog, blsh(bn), change);
1090 	}
1091 	for(;;) {
1092 		r->act.b[z] |= bb;
1093 		p = r->prog;
1094 
1095 		if(r->use1.b[z] & bb) {
1096 			change += CREF * r->loop;
1097 			if(debug['R'] && debug['v'])
1098 				print("%ld%P\tu1 %B $%d\n", r->loop,
1099 					p, blsh(bn), change);
1100 		}
1101 
1102 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1103 			change += CREF * r->loop;
1104 			if(debug['R'] && debug['v'])
1105 				print("%ld%P\tu2 %B $%d\n", r->loop,
1106 					p, blsh(bn), change);
1107 		}
1108 
1109 		if(STORE(r) & r->regdiff.b[z] & bb) {
1110 			change -= CLOAD * r->loop;
1111 			if(debug['R'] && debug['v'])
1112 				print("%ld%P\tst %B $%d\n", r->loop,
1113 					p, blsh(bn), change);
1114 		}
1115 
1116 		if(r->refbehind.b[z] & bb)
1117 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1118 				if(r1->refahead.b[z] & bb)
1119 					paint1(r1, bn);
1120 
1121 		if(!(r->refahead.b[z] & bb))
1122 			break;
1123 		r1 = r->s2;
1124 		if(r1 != R)
1125 			if(r1->refbehind.b[z] & bb)
1126 				paint1(r1, bn);
1127 		r = r->s1;
1128 		if(r == R)
1129 			break;
1130 		if(r->act.b[z] & bb)
1131 			break;
1132 		if(!(r->refbehind.b[z] & bb))
1133 			break;
1134 	}
1135 }
1136 
1137 ulong
1138 regset(Reg *r, ulong bb)
1139 {
1140 	ulong b, set;
1141 	Adr v;
1142 	int c;
1143 
1144 	set = 0;
1145 	v = zprog.from;
1146 	while(b = bb & ~(bb-1)) {
1147 		v.type = b & 0xFFFF? BtoR(b): BtoF(b);
1148 		if(v.type == 0)
1149 			diag(Z, "zero v.type for %#lux", b);
1150 		c = copyu(r->prog, &v, A);
1151 		if(c == 3)
1152 			set |= b;
1153 		bb &= ~b;
1154 	}
1155 	return set;
1156 }
1157 
1158 ulong
1159 reguse(Reg *r, ulong bb)
1160 {
1161 	ulong b, set;
1162 	Adr v;
1163 	int c;
1164 
1165 	set = 0;
1166 	v = zprog.from;
1167 	while(b = bb & ~(bb-1)) {
1168 		v.type = b & 0xFFFF? BtoR(b): BtoF(b);
1169 		c = copyu(r->prog, &v, A);
1170 		if(c == 1 || c == 2 || c == 4)
1171 			set |= b;
1172 		bb &= ~b;
1173 	}
1174 	return set;
1175 }
1176 
1177 ulong
1178 paint2(Reg *r, int bn)
1179 {
1180 	Reg *r1;
1181 	int z;
1182 	ulong bb, vreg, x;
1183 
1184 	z = bn/32;
1185 	bb = 1L << (bn%32);
1186 	vreg = regbits;
1187 	if(!(r->act.b[z] & bb))
1188 		return vreg;
1189 	for(;;) {
1190 		if(!(r->refbehind.b[z] & bb))
1191 			break;
1192 		r1 = r->p1;
1193 		if(r1 == R)
1194 			break;
1195 		if(!(r1->refahead.b[z] & bb))
1196 			break;
1197 		if(!(r1->act.b[z] & bb))
1198 			break;
1199 		r = r1;
1200 	}
1201 	for(;;) {
1202 		r->act.b[z] &= ~bb;
1203 
1204 		vreg |= r->regu;
1205 
1206 		if(r->refbehind.b[z] & bb)
1207 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1208 				if(r1->refahead.b[z] & bb)
1209 					vreg |= paint2(r1, bn);
1210 
1211 		if(!(r->refahead.b[z] & bb))
1212 			break;
1213 		r1 = r->s2;
1214 		if(r1 != R)
1215 			if(r1->refbehind.b[z] & bb)
1216 				vreg |= paint2(r1, bn);
1217 		r = r->s1;
1218 		if(r == R)
1219 			break;
1220 		if(!(r->act.b[z] & bb))
1221 			break;
1222 		if(!(r->refbehind.b[z] & bb))
1223 			break;
1224 	}
1225 
1226 	bb = vreg;
1227 	for(; r; r=r->s1) {
1228 		x = r->regu & ~bb;
1229 		if(x) {
1230 			vreg |= reguse(r, x);
1231 			bb |= regset(r, x);
1232 		}
1233 	}
1234 	return vreg;
1235 }
1236 
1237 void
1238 paint3(Reg *r, int bn, long rb, int rn)
1239 {
1240 	Reg *r1;
1241 	Prog *p;
1242 	int z;
1243 	ulong bb;
1244 
1245 	z = bn/32;
1246 	bb = 1L << (bn%32);
1247 	if(r->act.b[z] & bb)
1248 		return;
1249 	for(;;) {
1250 		if(!(r->refbehind.b[z] & bb))
1251 			break;
1252 		r1 = r->p1;
1253 		if(r1 == R)
1254 			break;
1255 		if(!(r1->refahead.b[z] & bb))
1256 			break;
1257 		if(r1->act.b[z] & bb)
1258 			break;
1259 		r = r1;
1260 	}
1261 
1262 	if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1263 		addmove(r, bn, rn, 0);
1264 	for(;;) {
1265 		r->act.b[z] |= bb;
1266 		p = r->prog;
1267 
1268 		if(r->use1.b[z] & bb) {
1269 			if(debug['R'])
1270 				print("%P", p);
1271 			addreg(&p->from, rn);
1272 			if(debug['R'])
1273 				print("\t.c%P\n", p);
1274 		}
1275 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1276 			if(debug['R'])
1277 				print("%P", p);
1278 			addreg(&p->to, rn);
1279 			if(debug['R'])
1280 				print("\t.c%P\n", p);
1281 		}
1282 
1283 		if(STORE(r) & r->regdiff.b[z] & bb)
1284 			addmove(r, bn, rn, 1);
1285 		r->regu |= rb;
1286 
1287 		if(r->refbehind.b[z] & bb)
1288 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1289 				if(r1->refahead.b[z] & bb)
1290 					paint3(r1, bn, rb, rn);
1291 
1292 		if(!(r->refahead.b[z] & bb))
1293 			break;
1294 		r1 = r->s2;
1295 		if(r1 != R)
1296 			if(r1->refbehind.b[z] & bb)
1297 				paint3(r1, bn, rb, rn);
1298 		r = r->s1;
1299 		if(r == R)
1300 			break;
1301 		if(r->act.b[z] & bb)
1302 			break;
1303 		if(!(r->refbehind.b[z] & bb))
1304 			break;
1305 	}
1306 }
1307 
1308 void
1309 addreg(Adr *a, int rn)
1310 {
1311 
1312 	a->sym = 0;
1313 	a->offset = 0;
1314 	a->type = rn;
1315 }
1316 
1317 long
1318 RtoB(int r)
1319 {
1320 
1321 	if(r < D_AX || r > D_R15)
1322 		return 0;
1323 	return 1L << (r-D_AX);
1324 }
1325 
1326 int
1327 BtoR(long b)
1328 {
1329 
1330 	b &= 0xffffL;
1331 	if(b == 0)
1332 		return 0;
1333 	return bitno(b) + D_AX;
1334 }
1335 
1336 /*
1337  *	bit	reg
1338  *	16	X5
1339  *	17	X6
1340  *	18	X7
1341  */
1342 long
1343 FtoB(int f)
1344 {
1345 	if(f < FREGMIN || f > FREGEXT)
1346 		return 0;
1347 	return 1L << (f - FREGMIN + 16);
1348 }
1349 
1350 int
1351 BtoF(long b)
1352 {
1353 
1354 	b &= 0x70000L;
1355 	if(b == 0)
1356 		return 0;
1357 	return bitno(b) - 16 + FREGMIN;
1358 }
1359