xref: /plan9-contrib/sys/src/cmd/6c/reg.c (revision 40d015479ed36701ae6dcfd8814f849fc6285e8d)
1 #include "gc.h"
2 
3 Reg*
rega(void)4 rega(void)
5 {
6 	Reg *r;
7 
8 	r = freer;
9 	if(r == R) {
10 		r = alloc(sizeof(*r));
11 	} else
12 		freer = r->link;
13 
14 	*r = zreg;
15 	return r;
16 }
17 
18 int
rcmp(const void * a1,const void * a2)19 rcmp(const void *a1, const void *a2)
20 {
21 	Rgn *p1, *p2;
22 	int c1, c2;
23 
24 	p1 = (Rgn*)a1;
25 	p2 = (Rgn*)a2;
26 	c1 = p2->cost;
27 	c2 = p1->cost;
28 	if(c1 -= c2)
29 		return c1;
30 	return p2->varno - p1->varno;
31 }
32 
33 void
regopt(Prog * p)34 regopt(Prog *p)
35 {
36 	Reg *r, *r1, *r2;
37 	Prog *p1;
38 	int i, z;
39 	long initpc, val, npc;
40 	ulong vreg;
41 	Bits bit;
42 	struct
43 	{
44 		long	m;
45 		long	c;
46 		Reg*	p;
47 	} log5[6], *lp;
48 
49 	firstr = R;
50 	lastr = R;
51 	nvar = 0;
52 	regbits = RtoB(D_SP) | RtoB(D_AX) | RtoB(D_X0);
53 	if(REGEXT)
54 		regbits |= RtoB(REGEXT) | RtoB(REGEXT-1);
55 	for(z=0; z<BITS; z++) {
56 		externs.b[z] = 0;
57 		params.b[z] = 0;
58 		consts.b[z] = 0;
59 		addrs.b[z] = 0;
60 	}
61 
62 	/*
63 	 * pass 1
64 	 * build aux data structure
65 	 * allocate pcs
66 	 * find use and set of variables
67 	 */
68 	val = 5L * 5L * 5L * 5L * 5L;
69 	lp = log5;
70 	for(i=0; i<5; i++) {
71 		lp->m = val;
72 		lp->c = 0;
73 		lp->p = R;
74 		val /= 5L;
75 		lp++;
76 	}
77 	val = 0;
78 	for(; p != P; p = p->link) {
79 		switch(p->as) {
80 		case ADATA:
81 		case AGLOBL:
82 		case ANAME:
83 		case ASIGNAME:
84 			continue;
85 		}
86 		r = rega();
87 		if(firstr == R) {
88 			firstr = r;
89 			lastr = r;
90 		} else {
91 			lastr->link = r;
92 			r->p1 = lastr;
93 			lastr->s1 = r;
94 			lastr = r;
95 		}
96 		r->prog = p;
97 		r->pc = val;
98 		val++;
99 
100 		lp = log5;
101 		for(i=0; i<5; i++) {
102 			lp->c--;
103 			if(lp->c <= 0) {
104 				lp->c = lp->m;
105 				if(lp->p != R)
106 					lp->p->log5 = r;
107 				lp->p = r;
108 				(lp+1)->c = 0;
109 				break;
110 			}
111 			lp++;
112 		}
113 
114 		r1 = r->p1;
115 		if(r1 != R)
116 		switch(r1->prog->as) {
117 		case ARET:
118 		case AJMP:
119 		case AIRETL:
120 		case AIRETQ:
121 			r->p1 = R;
122 			r1->s1 = R;
123 		}
124 
125 		bit = mkvar(r, &p->from);
126 		if(bany(&bit))
127 		switch(p->as) {
128 		/*
129 		 * funny
130 		 */
131 		case ALEAL:
132 		case ALEAQ:
133 			for(z=0; z<BITS; z++)
134 				addrs.b[z] |= bit.b[z];
135 			break;
136 
137 		/*
138 		 * left side read
139 		 */
140 		default:
141 			for(z=0; z<BITS; z++)
142 				r->use1.b[z] |= bit.b[z];
143 			break;
144 		}
145 
146 		bit = mkvar(r, &p->to);
147 		if(bany(&bit))
148 		switch(p->as) {
149 		default:
150 			diag(Z, "reg: unknown op: %A", p->as);
151 			break;
152 
153 		/*
154 		 * right side read
155 		 */
156 		case ACMPB:
157 		case ACMPL:
158 		case ACMPQ:
159 		case ACMPW:
160 		case ACOMISS:
161 		case ACOMISD:
162 		case AUCOMISS:
163 		case AUCOMISD:
164 			for(z=0; z<BITS; z++)
165 				r->use2.b[z] |= bit.b[z];
166 			break;
167 
168 		/*
169 		 * right side write
170 		 */
171 		case ANOP:
172 		case AMOVL:
173 		case AMOVQ:
174 		case AMOVB:
175 		case AMOVW:
176 		case AMOVBLSX:
177 		case AMOVBLZX:
178 		case AMOVBQSX:
179 		case AMOVBQZX:
180 		case AMOVLQSX:
181 		case AMOVLQZX:
182 		case AMOVWLSX:
183 		case AMOVWLZX:
184 		case AMOVWQSX:
185 		case AMOVWQZX:
186 		case AMOVQL:
187 
188 		case AMOVSS:
189 		case AMOVSD:
190 		case ACVTSD2SL:
191 		case ACVTSD2SQ:
192 		case ACVTSD2SS:
193 		case ACVTSL2SD:
194 		case ACVTSL2SS:
195 		case ACVTSQ2SD:
196 		case ACVTSQ2SS:
197 		case ACVTSS2SD:
198 		case ACVTSS2SL:
199 		case ACVTSS2SQ:
200 		case ACVTTSD2SL:
201 		case ACVTTSD2SQ:
202 		case ACVTTSS2SL:
203 		case ACVTTSS2SQ:
204 			for(z=0; z<BITS; z++)
205 				r->set.b[z] |= bit.b[z];
206 			break;
207 
208 		/*
209 		 * right side read+write
210 		 */
211 		case AADDB:
212 		case AADDL:
213 		case AADDQ:
214 		case AADDW:
215 		case AANDB:
216 		case AANDL:
217 		case AANDQ:
218 		case AANDW:
219 		case ASUBB:
220 		case ASUBL:
221 		case ASUBQ:
222 		case ASUBW:
223 		case AORB:
224 		case AORL:
225 		case AORQ:
226 		case AORW:
227 		case AXORB:
228 		case AXORL:
229 		case AXORQ:
230 		case AXORW:
231 		case ASALB:
232 		case ASALL:
233 		case ASALQ:
234 		case ASALW:
235 		case ASARB:
236 		case ASARL:
237 		case ASARQ:
238 		case ASARW:
239 		case AROLB:
240 		case AROLL:
241 		case AROLQ:
242 		case AROLW:
243 		case ARORB:
244 		case ARORL:
245 		case ARORQ:
246 		case ARORW:
247 		case ASHLB:
248 		case ASHLL:
249 		case ASHLQ:
250 		case ASHLW:
251 		case ASHRB:
252 		case ASHRL:
253 		case ASHRQ:
254 		case ASHRW:
255 		case AIMULL:
256 		case AIMULQ:
257 		case AIMULW:
258 		case ANEGL:
259 		case ANEGQ:
260 		case ANOTL:
261 		case ANOTQ:
262 		case AADCL:
263 		case AADCQ:
264 		case ASBBL:
265 		case ASBBQ:
266 
267 		case AADDSD:
268 		case AADDSS:
269 		case ACMPSD:
270 		case ACMPSS:
271 		case ADIVSD:
272 		case ADIVSS:
273 		case AMAXSD:
274 		case AMAXSS:
275 		case AMINSD:
276 		case AMINSS:
277 		case AMULSD:
278 		case AMULSS:
279 		case ARCPSS:
280 		case ARSQRTSS:
281 		case ASQRTSD:
282 		case ASQRTSS:
283 		case ASUBSD:
284 		case ASUBSS:
285 		case AXORPD:
286 			for(z=0; z<BITS; z++) {
287 				r->set.b[z] |= bit.b[z];
288 				r->use2.b[z] |= bit.b[z];
289 			}
290 			break;
291 
292 		/*
293 		 * funny
294 		 */
295 		case ACALL:
296 			for(z=0; z<BITS; z++)
297 				addrs.b[z] |= bit.b[z];
298 			break;
299 		}
300 
301 		switch(p->as) {
302 		case AIMULL:
303 		case AIMULQ:
304 		case AIMULW:
305 			if(p->to.type != D_NONE)
306 				break;
307 
308 		case AIDIVB:
309 		case AIDIVL:
310 		case AIDIVQ:
311 		case AIDIVW:
312 		case AIMULB:
313 		case ADIVB:
314 		case ADIVL:
315 		case ADIVQ:
316 		case ADIVW:
317 		case AMULB:
318 		case AMULL:
319 		case AMULQ:
320 		case AMULW:
321 
322 		case ACWD:
323 		case ACDQ:
324 		case ACQO:
325 			r->regu |= RtoB(D_AX) | RtoB(D_DX);
326 			break;
327 
328 		case AREP:
329 		case AREPN:
330 		case ALOOP:
331 		case ALOOPEQ:
332 		case ALOOPNE:
333 			r->regu |= RtoB(D_CX);
334 			break;
335 
336 		case AMOVSB:
337 		case AMOVSL:
338 		case AMOVSQ:
339 		case AMOVSW:
340 		case ACMPSB:
341 		case ACMPSL:
342 		case ACMPSQ:
343 		case ACMPSW:
344 			r->regu |= RtoB(D_SI) | RtoB(D_DI);
345 			break;
346 
347 		case ASTOSB:
348 		case ASTOSL:
349 		case ASTOSQ:
350 		case ASTOSW:
351 		case ASCASB:
352 		case ASCASL:
353 		case ASCASQ:
354 		case ASCASW:
355 			r->regu |= RtoB(D_AX) | RtoB(D_DI);
356 			break;
357 
358 		case AINSB:
359 		case AINSL:
360 		case AINSW:
361 		case AOUTSB:
362 		case AOUTSL:
363 		case AOUTSW:
364 			r->regu |= RtoB(D_DI) | RtoB(D_DX);
365 			break;
366 		}
367 	}
368 	if(firstr == R)
369 		return;
370 	initpc = pc - val;
371 	npc = val;
372 
373 	/*
374 	 * pass 2
375 	 * turn branch references to pointers
376 	 * build back pointers
377 	 */
378 	for(r = firstr; r != R; r = r->link) {
379 		p = r->prog;
380 		if(p->to.type == D_BRANCH) {
381 			val = p->to.offset - initpc;
382 			r1 = firstr;
383 			while(r1 != R) {
384 				r2 = r1->log5;
385 				if(r2 != R && val >= r2->pc) {
386 					r1 = r2;
387 					continue;
388 				}
389 				if(r1->pc == val)
390 					break;
391 				r1 = r1->link;
392 			}
393 			if(r1 == R) {
394 				nearln = p->lineno;
395 				diag(Z, "ref not found\n%P", p);
396 				continue;
397 			}
398 			if(r1 == r) {
399 				nearln = p->lineno;
400 				diag(Z, "ref to self\n%P", p);
401 				continue;
402 			}
403 			r->s2 = r1;
404 			r->p2link = r1->p2;
405 			r1->p2 = r;
406 		}
407 	}
408 	if(debug['R']) {
409 		p = firstr->prog;
410 		print("\n%L %D\n", p->lineno, &p->from);
411 	}
412 
413 	/*
414 	 * pass 2.5
415 	 * find looping structure
416 	 */
417 	for(r = firstr; r != R; r = r->link)
418 		r->active = 0;
419 	change = 0;
420 	loopit(firstr, npc);
421 	if(debug['R'] && debug['v']) {
422 		print("\nlooping structure:\n");
423 		for(r = firstr; r != R; r = r->link) {
424 			print("%ld:%P", r->loop, r->prog);
425 			for(z=0; z<BITS; z++)
426 				bit.b[z] = r->use1.b[z] |
427 					   r->use2.b[z] |
428 					   r->set.b[z];
429 			if(bany(&bit)) {
430 				print("\t");
431 				if(bany(&r->use1))
432 					print(" u1=%B", r->use1);
433 				if(bany(&r->use2))
434 					print(" u2=%B", r->use2);
435 				if(bany(&r->set))
436 					print(" st=%B", r->set);
437 			}
438 			print("\n");
439 		}
440 	}
441 
442 	/*
443 	 * pass 3
444 	 * iterate propagating usage
445 	 * 	back until flow graph is complete
446 	 */
447 loop1:
448 	change = 0;
449 	for(r = firstr; r != R; r = r->link)
450 		r->active = 0;
451 	for(r = firstr; r != R; r = r->link)
452 		if(r->prog->as == ARET)
453 			prop(r, zbits, zbits);
454 loop11:
455 	/* pick up unreachable code */
456 	i = 0;
457 	for(r = firstr; r != R; r = r1) {
458 		r1 = r->link;
459 		if(r1 && r1->active && !r->active) {
460 			prop(r, zbits, zbits);
461 			i = 1;
462 		}
463 	}
464 	if(i)
465 		goto loop11;
466 	if(change)
467 		goto loop1;
468 
469 
470 	/*
471 	 * pass 4
472 	 * iterate propagating register/variable synchrony
473 	 * 	forward until graph is complete
474 	 */
475 loop2:
476 	change = 0;
477 	for(r = firstr; r != R; r = r->link)
478 		r->active = 0;
479 	synch(firstr, zbits);
480 	if(change)
481 		goto loop2;
482 
483 
484 	/*
485 	 * pass 5
486 	 * isolate regions
487 	 * calculate costs (paint1)
488 	 */
489 	r = firstr;
490 	if(r) {
491 		for(z=0; z<BITS; z++)
492 			bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
493 			  ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
494 		if(bany(&bit)) {
495 			nearln = r->prog->lineno;
496 			warn(Z, "used and not set: %B", bit);
497 			if(debug['R'] && !debug['w'])
498 				print("used and not set: %B\n", bit);
499 		}
500 	}
501 	if(debug['R'] && debug['v'])
502 		print("\nprop structure:\n");
503 	for(r = firstr; r != R; r = r->link)
504 		r->act = zbits;
505 	rgp = region;
506 	nregion = 0;
507 	for(r = firstr; r != R; r = r->link) {
508 		if(debug['R'] && debug['v']) {
509 			print("%P\t", r->prog);
510 			if(bany(&r->set))
511 				print("s:%B ", r->set);
512 			if(bany(&r->refahead))
513 				print("ra:%B ", r->refahead);
514 			if(bany(&r->calahead))
515 				print("ca:%B ", r->calahead);
516 			print("\n");
517 		}
518 		for(z=0; z<BITS; z++)
519 			bit.b[z] = r->set.b[z] &
520 			  ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
521 		if(bany(&bit)) {
522 			nearln = r->prog->lineno;
523 			warn(Z, "set and not used: %B", bit);
524 			if(debug['R'])
525 				print("set and not used: %B\n", bit);
526 			excise(r);
527 		}
528 		for(z=0; z<BITS; z++)
529 			bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
530 		while(bany(&bit)) {
531 			i = bnum(bit);
532 			rgp->enter = r;
533 			rgp->varno = i;
534 			change = 0;
535 			if(debug['R'] && debug['v'])
536 				print("\n");
537 			paint1(r, i);
538 			bit.b[i/32] &= ~(1L<<(i%32));
539 			if(change <= 0) {
540 				if(debug['R'])
541 					print("%L$%d: %B\n",
542 						r->prog->lineno, change, blsh(i));
543 				continue;
544 			}
545 			rgp->cost = change;
546 			nregion++;
547 			if(nregion >= NRGN) {
548 				warn(Z, "too many regions");
549 				goto brk;
550 			}
551 			rgp++;
552 		}
553 	}
554 brk:
555 	qsort(region, nregion, sizeof(region[0]), rcmp);
556 
557 	/*
558 	 * pass 6
559 	 * determine used registers (paint2)
560 	 * replace code (paint3)
561 	 */
562 	rgp = region;
563 	for(i=0; i<nregion; i++) {
564 		bit = blsh(rgp->varno);
565 		vreg = paint2(rgp->enter, rgp->varno);
566 		vreg = allreg(vreg, rgp);
567 		if(debug['R']) {
568 			print("%L$%d %R: %B\n",
569 				rgp->enter->prog->lineno,
570 				rgp->cost,
571 				rgp->regno,
572 				bit);
573 		}
574 		if(rgp->regno != 0)
575 			paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
576 		rgp++;
577 	}
578 	/*
579 	 * pass 7
580 	 * peep-hole on basic block
581 	 */
582 	if(!debug['R'] || debug['P'])
583 		peep();
584 
585 	/*
586 	 * pass 8
587 	 * recalculate pc
588 	 */
589 	val = initpc;
590 	for(r = firstr; r != R; r = r1) {
591 		r->pc = val;
592 		p = r->prog;
593 		p1 = P;
594 		r1 = r->link;
595 		if(r1 != R)
596 			p1 = r1->prog;
597 		for(; p != p1; p = p->link) {
598 			switch(p->as) {
599 			default:
600 				val++;
601 				break;
602 
603 			case ANOP:
604 			case ADATA:
605 			case AGLOBL:
606 			case ANAME:
607 			case ASIGNAME:
608 				break;
609 			}
610 		}
611 	}
612 	pc = val;
613 
614 	/*
615 	 * fix up branches
616 	 */
617 	if(debug['R'])
618 		if(bany(&addrs))
619 			print("addrs: %B\n", addrs);
620 
621 	r1 = 0; /* set */
622 	for(r = firstr; r != R; r = r->link) {
623 		p = r->prog;
624 		if(p->to.type == D_BRANCH)
625 			p->to.offset = r->s2->pc;
626 		r1 = r;
627 	}
628 
629 	/*
630 	 * last pass
631 	 * eliminate nops
632 	 * free aux structures
633 	 */
634 	for(p = firstr->prog; p != P; p = p->link){
635 		while(p->link && p->link->as == ANOP)
636 			p->link = p->link->link;
637 	}
638 	if(r1 != R) {
639 		r1->link = freer;
640 		freer = firstr;
641 	}
642 }
643 
644 /*
645  * add mov b,rn
646  * just after r
647  */
648 void
addmove(Reg * r,int bn,int rn,int f)649 addmove(Reg *r, int bn, int rn, int f)
650 {
651 	Prog *p, *p1;
652 	Adr *a;
653 	Var *v;
654 
655 	p1 = alloc(sizeof(*p1));
656 	*p1 = zprog;
657 	p = r->prog;
658 
659 	p1->link = p->link;
660 	p->link = p1;
661 	p1->lineno = p->lineno;
662 
663 	v = var + bn;
664 
665 	a = &p1->to;
666 	a->sym = v->sym;
667 	a->offset = v->offset;
668 	a->etype = v->etype;
669 	a->type = v->name;
670 
671 	p1->as = AMOVL;
672 	if(v->etype == TCHAR || v->etype == TUCHAR)
673 		p1->as = AMOVB;
674 	if(v->etype == TSHORT || v->etype == TUSHORT)
675 		p1->as = AMOVW;
676 	if(v->etype == TVLONG || v->etype == TUVLONG || v->etype == TIND)
677 		p1->as = AMOVQ;
678 	if(v->etype == TFLOAT)
679 		p1->as = AMOVSS;
680 	if(v->etype == TDOUBLE)
681 		p1->as = AMOVSD;
682 
683 	p1->from.type = rn;
684 	if(!f) {
685 		p1->from = *a;
686 		*a = zprog.from;
687 		a->type = rn;
688 		if(v->etype == TUCHAR)
689 			p1->as = AMOVB;
690 		if(v->etype == TUSHORT)
691 			p1->as = AMOVW;
692 	}
693 	if(debug['R'])
694 		print("%P\t.a%P\n", p, p1);
695 }
696 
697 ulong
doregbits(int r)698 doregbits(int r)
699 {
700 	ulong b;
701 
702 	b = 0;
703 	if(r >= D_INDIR)
704 		r -= D_INDIR;
705 	if(r >= D_AX && r <= D_R15)
706 		b |= RtoB(r);
707 	else
708 	if(r >= D_AL && r <= D_R15B)
709 		b |= RtoB(r-D_AL+D_AX);
710 	else
711 	if(r >= D_AH && r <= D_BH)
712 		b |= RtoB(r-D_AH+D_AX);
713 	else
714 	if(r >= D_X0 && r <= D_X0+15)
715 		b |= FtoB(r);
716 	return b;
717 }
718 
719 Bits
mkvar(Reg * r,Adr * a)720 mkvar(Reg *r, Adr *a)
721 {
722 	Var *v;
723 	int i, t, n, et, z;
724 	long o;
725 	Bits bit;
726 	Sym *s;
727 
728 	/*
729 	 * mark registers used
730 	 */
731 	t = a->type;
732 	r->regu |= doregbits(t);
733 	r->regu |= doregbits(a->index);
734 
735 	s = a->sym;
736 	switch(t) {
737 	default:
738 		goto none;
739 	case D_ADDR:
740 		a->type = a->index;
741 		bit = mkvar(r, a);
742 		for(z=0; z<BITS; z++)
743 			addrs.b[z] |= bit.b[z];
744 		a->type = t;
745 		goto none;
746 	case D_EXTERN:
747 	case D_STATIC:
748 		if(s->type != nil && s->type->garb & GVOLATILE)
749 			goto none;
750 	case D_PARAM:
751 	case D_AUTO:
752 		n = t;
753 		break;
754 	}
755 	if(s == S)
756 		goto none;
757 	if(s->name[0] == '.')
758 		goto none;
759 	et = a->etype;
760 	o = a->offset;
761 	v = var;
762 	for(i=0; i<nvar; i++) {
763 		if(s == v->sym)
764 		if(n == v->name)
765 		if(o == v->offset)
766 			goto out;
767 		v++;
768 	}
769 	if(nvar >= NVAR) {
770 		if(debug['w'] > 1 && s)
771 			warn(Z, "variable not optimized: %s", s->name);
772 		goto none;
773 	}
774 	i = nvar;
775 	nvar++;
776 	v = &var[i];
777 	v->sym = s;
778 	v->offset = o;
779 	v->name = n;
780 	v->etype = et;
781 	if(debug['R'])
782 		print("bit=%2d et=%2d %D\n", i, et, a);
783 
784 out:
785 	bit = blsh(i);
786 	if(n == D_EXTERN || n == D_STATIC)
787 		for(z=0; z<BITS; z++)
788 			externs.b[z] |= bit.b[z];
789 	if(n == D_PARAM)
790 		for(z=0; z<BITS; z++)
791 			params.b[z] |= bit.b[z];
792 	if(v->etype != et || !(typechlpfd[et] || typev[et]))	/* funny punning */
793 		for(z=0; z<BITS; z++)
794 			addrs.b[z] |= bit.b[z];
795 	return bit;
796 
797 none:
798 	return zbits;
799 }
800 
801 void
prop(Reg * r,Bits ref,Bits cal)802 prop(Reg *r, Bits ref, Bits cal)
803 {
804 	Reg *r1, *r2;
805 	int z;
806 
807 	for(r1 = r; r1 != R; r1 = r1->p1) {
808 		for(z=0; z<BITS; z++) {
809 			ref.b[z] |= r1->refahead.b[z];
810 			if(ref.b[z] != r1->refahead.b[z]) {
811 				r1->refahead.b[z] = ref.b[z];
812 				change++;
813 			}
814 			cal.b[z] |= r1->calahead.b[z];
815 			if(cal.b[z] != r1->calahead.b[z]) {
816 				r1->calahead.b[z] = cal.b[z];
817 				change++;
818 			}
819 		}
820 		switch(r1->prog->as) {
821 		case ACALL:
822 			for(z=0; z<BITS; z++) {
823 				cal.b[z] |= ref.b[z] | externs.b[z];
824 				ref.b[z] = 0;
825 			}
826 			break;
827 
828 		case ATEXT:
829 			for(z=0; z<BITS; z++) {
830 				cal.b[z] = 0;
831 				ref.b[z] = 0;
832 			}
833 			break;
834 
835 		case ARET:
836 			for(z=0; z<BITS; z++) {
837 				cal.b[z] = externs.b[z];
838 				ref.b[z] = 0;
839 			}
840 		}
841 		for(z=0; z<BITS; z++) {
842 			ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
843 				r1->use1.b[z] | r1->use2.b[z];
844 			cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
845 			r1->refbehind.b[z] = ref.b[z];
846 			r1->calbehind.b[z] = cal.b[z];
847 		}
848 		if(r1->active)
849 			break;
850 		r1->active = 1;
851 	}
852 	for(; r != r1; r = r->p1)
853 		for(r2 = r->p2; r2 != R; r2 = r2->p2link)
854 			prop(r2, r->refbehind, r->calbehind);
855 }
856 
857 /*
858  * find looping structure
859  *
860  * 1) find reverse postordering
861  * 2) find approximate dominators,
862  *	the actual dominators if the flow graph is reducible
863  *	otherwise, dominators plus some other non-dominators.
864  *	See Matthew S. Hecht and Jeffrey D. Ullman,
865  *	"Analysis of a Simple Algorithm for Global Data Flow Problems",
866  *	Conf.  Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
867  *	Oct. 1-3, 1973, pp.  207-217.
868  * 3) find all nodes with a predecessor dominated by the current node.
869  *	such a node is a loop head.
870  *	recursively, all preds with a greater rpo number are in the loop
871  */
872 long
postorder(Reg * r,Reg ** rpo2r,long n)873 postorder(Reg *r, Reg **rpo2r, long n)
874 {
875 	Reg *r1;
876 
877 	r->rpo = 1;
878 	r1 = r->s1;
879 	if(r1 && !r1->rpo)
880 		n = postorder(r1, rpo2r, n);
881 	r1 = r->s2;
882 	if(r1 && !r1->rpo)
883 		n = postorder(r1, rpo2r, n);
884 	rpo2r[n] = r;
885 	n++;
886 	return n;
887 }
888 
889 long
rpolca(long * idom,long rpo1,long rpo2)890 rpolca(long *idom, long rpo1, long rpo2)
891 {
892 	long t;
893 
894 	if(rpo1 == -1)
895 		return rpo2;
896 	while(rpo1 != rpo2){
897 		if(rpo1 > rpo2){
898 			t = rpo2;
899 			rpo2 = rpo1;
900 			rpo1 = t;
901 		}
902 		while(rpo1 < rpo2){
903 			t = idom[rpo2];
904 			if(t >= rpo2)
905 				fatal(Z, "bad idom");
906 			rpo2 = t;
907 		}
908 	}
909 	return rpo1;
910 }
911 
912 int
doms(long * idom,long r,long s)913 doms(long *idom, long r, long s)
914 {
915 	while(s > r)
916 		s = idom[s];
917 	return s == r;
918 }
919 
920 int
loophead(long * idom,Reg * r)921 loophead(long *idom, Reg *r)
922 {
923 	long src;
924 
925 	src = r->rpo;
926 	if(r->p1 != R && doms(idom, src, r->p1->rpo))
927 		return 1;
928 	for(r = r->p2; r != R; r = r->p2link)
929 		if(doms(idom, src, r->rpo))
930 			return 1;
931 	return 0;
932 }
933 
934 void
loopmark(Reg ** rpo2r,long head,Reg * r)935 loopmark(Reg **rpo2r, long head, Reg *r)
936 {
937 	if(r->rpo < head || r->active == head)
938 		return;
939 	r->active = head;
940 	r->loop += LOOP;
941 	if(r->p1 != R)
942 		loopmark(rpo2r, head, r->p1);
943 	for(r = r->p2; r != R; r = r->p2link)
944 		loopmark(rpo2r, head, r);
945 }
946 
947 void
loopit(Reg * r,long nr)948 loopit(Reg *r, long nr)
949 {
950 	Reg *r1;
951 	long i, d, me;
952 
953 	if(nr > maxnr) {
954 		rpo2r = alloc(nr * sizeof(Reg*));
955 		idom = alloc(nr * sizeof(long));
956 		maxnr = nr;
957 	}
958 
959 	d = postorder(r, rpo2r, 0);
960 	if(d > nr)
961 		fatal(Z, "too many reg nodes");
962 	nr = d;
963 	for(i = 0; i < nr / 2; i++){
964 		r1 = rpo2r[i];
965 		rpo2r[i] = rpo2r[nr - 1 - i];
966 		rpo2r[nr - 1 - i] = r1;
967 	}
968 	for(i = 0; i < nr; i++)
969 		rpo2r[i]->rpo = i;
970 
971 	idom[0] = 0;
972 	for(i = 0; i < nr; i++){
973 		r1 = rpo2r[i];
974 		me = r1->rpo;
975 		d = -1;
976 		if(r1->p1 != R && r1->p1->rpo < me)
977 			d = r1->p1->rpo;
978 		for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
979 			if(r1->rpo < me)
980 				d = rpolca(idom, d, r1->rpo);
981 		idom[i] = d;
982 	}
983 
984 	for(i = 0; i < nr; i++){
985 		r1 = rpo2r[i];
986 		r1->loop++;
987 		if(r1->p2 != R && loophead(idom, r1))
988 			loopmark(rpo2r, i, r1);
989 	}
990 }
991 
992 void
synch(Reg * r,Bits dif)993 synch(Reg *r, Bits dif)
994 {
995 	Reg *r1;
996 	int z;
997 
998 	for(r1 = r; r1 != R; r1 = r1->s1) {
999 		for(z=0; z<BITS; z++) {
1000 			dif.b[z] = (dif.b[z] &
1001 				~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
1002 					r1->set.b[z] | r1->regdiff.b[z];
1003 			if(dif.b[z] != r1->regdiff.b[z]) {
1004 				r1->regdiff.b[z] = dif.b[z];
1005 				change++;
1006 			}
1007 		}
1008 		if(r1->active)
1009 			break;
1010 		r1->active = 1;
1011 		for(z=0; z<BITS; z++)
1012 			dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
1013 		if(r1->s2 != R)
1014 			synch(r1->s2, dif);
1015 	}
1016 }
1017 
1018 ulong
allreg(ulong b,Rgn * r)1019 allreg(ulong b, Rgn *r)
1020 {
1021 	Var *v;
1022 	int i;
1023 
1024 	v = var + r->varno;
1025 	r->regno = 0;
1026 	switch(v->etype) {
1027 
1028 	default:
1029 		diag(Z, "unknown etype %d/%d", bitno(b), v->etype);
1030 		break;
1031 
1032 	case TCHAR:
1033 	case TUCHAR:
1034 	case TSHORT:
1035 	case TUSHORT:
1036 	case TINT:
1037 	case TUINT:
1038 	case TLONG:
1039 	case TULONG:
1040 	case TVLONG:
1041 	case TUVLONG:
1042 	case TIND:
1043 	case TARRAY:
1044 		i = BtoR(~b);
1045 		if(i && r->cost > 0) {
1046 			r->regno = i;
1047 			return RtoB(i);
1048 		}
1049 		break;
1050 
1051 	case TDOUBLE:
1052 	case TFLOAT:
1053 		i = BtoF(~b);
1054 		if(i && r->cost > 0) {
1055 			r->regno = i;
1056 			return FtoB(i);
1057 		}
1058 		break;
1059 	}
1060 	return 0;
1061 }
1062 
1063 void
paint1(Reg * r,int bn)1064 paint1(Reg *r, int bn)
1065 {
1066 	Reg *r1;
1067 	Prog *p;
1068 	int z;
1069 	ulong bb;
1070 
1071 	z = bn/32;
1072 	bb = 1L<<(bn%32);
1073 	if(r->act.b[z] & bb)
1074 		return;
1075 	for(;;) {
1076 		if(!(r->refbehind.b[z] & bb))
1077 			break;
1078 		r1 = r->p1;
1079 		if(r1 == R)
1080 			break;
1081 		if(!(r1->refahead.b[z] & bb))
1082 			break;
1083 		if(r1->act.b[z] & bb)
1084 			break;
1085 		r = r1;
1086 	}
1087 
1088 	if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
1089 		change -= CLOAD * r->loop;
1090 		if(debug['R'] && debug['v'])
1091 			print("%ld%P\tld %B $%d\n", r->loop,
1092 				r->prog, blsh(bn), change);
1093 	}
1094 	for(;;) {
1095 		r->act.b[z] |= bb;
1096 		p = r->prog;
1097 
1098 		if(r->use1.b[z] & bb) {
1099 			change += CREF * r->loop;
1100 			if(debug['R'] && debug['v'])
1101 				print("%ld%P\tu1 %B $%d\n", r->loop,
1102 					p, blsh(bn), change);
1103 		}
1104 
1105 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1106 			change += CREF * r->loop;
1107 			if(debug['R'] && debug['v'])
1108 				print("%ld%P\tu2 %B $%d\n", r->loop,
1109 					p, blsh(bn), change);
1110 		}
1111 
1112 		if(STORE(r) & r->regdiff.b[z] & bb) {
1113 			change -= CLOAD * r->loop;
1114 			if(debug['R'] && debug['v'])
1115 				print("%ld%P\tst %B $%d\n", r->loop,
1116 					p, blsh(bn), change);
1117 		}
1118 
1119 		if(r->refbehind.b[z] & bb)
1120 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1121 				if(r1->refahead.b[z] & bb)
1122 					paint1(r1, bn);
1123 
1124 		if(!(r->refahead.b[z] & bb))
1125 			break;
1126 		r1 = r->s2;
1127 		if(r1 != R)
1128 			if(r1->refbehind.b[z] & bb)
1129 				paint1(r1, bn);
1130 		r = r->s1;
1131 		if(r == R)
1132 			break;
1133 		if(r->act.b[z] & bb)
1134 			break;
1135 		if(!(r->refbehind.b[z] & bb))
1136 			break;
1137 	}
1138 }
1139 
1140 ulong
regset(Reg * r,ulong bb)1141 regset(Reg *r, ulong bb)
1142 {
1143 	ulong b, set;
1144 	Adr v;
1145 	int c;
1146 
1147 	set = 0;
1148 	v = zprog.from;
1149 	while(b = bb & ~(bb-1)) {
1150 		v.type = b & 0xFFFF? BtoR(b): BtoF(b);
1151 		if(v.type == 0)
1152 			diag(Z, "zero v.type for %#lux", b);
1153 		c = copyu(r->prog, &v, A);
1154 		if(c == 3)
1155 			set |= b;
1156 		bb &= ~b;
1157 	}
1158 	return set;
1159 }
1160 
1161 ulong
reguse(Reg * r,ulong bb)1162 reguse(Reg *r, ulong bb)
1163 {
1164 	ulong b, set;
1165 	Adr v;
1166 	int c;
1167 
1168 	set = 0;
1169 	v = zprog.from;
1170 	while(b = bb & ~(bb-1)) {
1171 		v.type = b & 0xFFFF? BtoR(b): BtoF(b);
1172 		c = copyu(r->prog, &v, A);
1173 		if(c == 1 || c == 2 || c == 4)
1174 			set |= b;
1175 		bb &= ~b;
1176 	}
1177 	return set;
1178 }
1179 
1180 ulong
paint2(Reg * r,int bn)1181 paint2(Reg *r, int bn)
1182 {
1183 	Reg *r1;
1184 	int z;
1185 	ulong bb, vreg, x;
1186 
1187 	z = bn/32;
1188 	bb = 1L << (bn%32);
1189 	vreg = regbits;
1190 	if(!(r->act.b[z] & bb))
1191 		return vreg;
1192 	for(;;) {
1193 		if(!(r->refbehind.b[z] & bb))
1194 			break;
1195 		r1 = r->p1;
1196 		if(r1 == R)
1197 			break;
1198 		if(!(r1->refahead.b[z] & bb))
1199 			break;
1200 		if(!(r1->act.b[z] & bb))
1201 			break;
1202 		r = r1;
1203 	}
1204 	for(;;) {
1205 		r->act.b[z] &= ~bb;
1206 
1207 		vreg |= r->regu;
1208 
1209 		if(r->refbehind.b[z] & bb)
1210 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1211 				if(r1->refahead.b[z] & bb)
1212 					vreg |= paint2(r1, bn);
1213 
1214 		if(!(r->refahead.b[z] & bb))
1215 			break;
1216 		r1 = r->s2;
1217 		if(r1 != R)
1218 			if(r1->refbehind.b[z] & bb)
1219 				vreg |= paint2(r1, bn);
1220 		r = r->s1;
1221 		if(r == R)
1222 			break;
1223 		if(!(r->act.b[z] & bb))
1224 			break;
1225 		if(!(r->refbehind.b[z] & bb))
1226 			break;
1227 	}
1228 
1229 	bb = vreg;
1230 	for(; r; r=r->s1) {
1231 		x = r->regu & ~bb;
1232 		if(x) {
1233 			vreg |= reguse(r, x);
1234 			bb |= regset(r, x);
1235 		}
1236 	}
1237 	return vreg;
1238 }
1239 
1240 void
paint3(Reg * r,int bn,long rb,int rn)1241 paint3(Reg *r, int bn, long rb, int rn)
1242 {
1243 	Reg *r1;
1244 	Prog *p;
1245 	int z;
1246 	ulong bb;
1247 
1248 	z = bn/32;
1249 	bb = 1L << (bn%32);
1250 	if(r->act.b[z] & bb)
1251 		return;
1252 	for(;;) {
1253 		if(!(r->refbehind.b[z] & bb))
1254 			break;
1255 		r1 = r->p1;
1256 		if(r1 == R)
1257 			break;
1258 		if(!(r1->refahead.b[z] & bb))
1259 			break;
1260 		if(r1->act.b[z] & bb)
1261 			break;
1262 		r = r1;
1263 	}
1264 
1265 	if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
1266 		addmove(r, bn, rn, 0);
1267 	for(;;) {
1268 		r->act.b[z] |= bb;
1269 		p = r->prog;
1270 
1271 		if(r->use1.b[z] & bb) {
1272 			if(debug['R'])
1273 				print("%P", p);
1274 			addreg(&p->from, rn);
1275 			if(debug['R'])
1276 				print("\t.c%P\n", p);
1277 		}
1278 		if((r->use2.b[z]|r->set.b[z]) & bb) {
1279 			if(debug['R'])
1280 				print("%P", p);
1281 			addreg(&p->to, rn);
1282 			if(debug['R'])
1283 				print("\t.c%P\n", p);
1284 		}
1285 
1286 		if(STORE(r) & r->regdiff.b[z] & bb)
1287 			addmove(r, bn, rn, 1);
1288 		r->regu |= rb;
1289 
1290 		if(r->refbehind.b[z] & bb)
1291 			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
1292 				if(r1->refahead.b[z] & bb)
1293 					paint3(r1, bn, rb, rn);
1294 
1295 		if(!(r->refahead.b[z] & bb))
1296 			break;
1297 		r1 = r->s2;
1298 		if(r1 != R)
1299 			if(r1->refbehind.b[z] & bb)
1300 				paint3(r1, bn, rb, rn);
1301 		r = r->s1;
1302 		if(r == R)
1303 			break;
1304 		if(r->act.b[z] & bb)
1305 			break;
1306 		if(!(r->refbehind.b[z] & bb))
1307 			break;
1308 	}
1309 }
1310 
1311 void
addreg(Adr * a,int rn)1312 addreg(Adr *a, int rn)
1313 {
1314 
1315 	a->sym = 0;
1316 	a->offset = 0;
1317 	a->type = rn;
1318 }
1319 
1320 long
RtoB(int r)1321 RtoB(int r)
1322 {
1323 
1324 	if(r < D_AX || r > D_R15)
1325 		return 0;
1326 	return 1L << (r-D_AX);
1327 }
1328 
1329 int
BtoR(long b)1330 BtoR(long b)
1331 {
1332 
1333 	b &= 0xffffL;
1334 	if(b == 0)
1335 		return 0;
1336 	return bitno(b) + D_AX;
1337 }
1338 
1339 /*
1340  *	bit	reg
1341  *	16	X5
1342  *	17	X6
1343  *	18	X7
1344  */
1345 long
FtoB(int f)1346 FtoB(int f)
1347 {
1348 	if(f < FREGMIN || f > FREGEXT)
1349 		return 0;
1350 	return 1L << (f - FREGMIN + 16);
1351 }
1352 
1353 int
BtoF(long b)1354 BtoF(long b)
1355 {
1356 
1357 	b &= 0x70000L;
1358 	if(b == 0)
1359 		return 0;
1360 	return bitno(b) - 16 + FREGMIN;
1361 }
1362