xref: /plan9/sys/src/9/port/proc.c (revision 4d44ba9b9ee4246ddbd96c7fcaf0918ab92ab35a)
1 #include	<u.h>
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 #include	"edf.h"
8 #include	<trace.h>
9 
int	coopsched;	/* enable the cooperative-scheduling fast path in runproc() */
int	schedgain = 30;	/* units in seconds */
int	nrdy;		/* number of processes currently on the run queues */
Ref	noteidalloc;	/* allocator for note-group ids */

void updatecpu(Proc*);
int reprioritize(Proc*);

/* scheduler statistics */
ulong	delayedscheds;	/* statistics */
long skipscheds;
long preempts;
ulong load;

static Ref	pidalloc;	/* allocator for process ids */
24 
/*
 *  process table: a fixed arena of Procs, a free list threaded
 *  through Proc.qnext, and a pid hash for fast pid lookup.
 */
static struct Procalloc
{
	Lock;
	Proc*	ht[128];	/* pid hash chains, linked by Proc.pidhash */
	Proc*	arena;		/* the Proc table itself */
	Proc*	free;		/* free list, linked by Proc.qnext */
} procalloc;
32 
enum
{
	Q=10,		/* NOTE(review): Q and DQ appear unused in this file — confirm */
	DQ=4,
	Scaling=2,	/* fixed-point scale applied to ticks in updatecpu() */
};

Schedq	runq[Nrq];	/* one FIFO queue per priority level */
ulong	runvec;		/* bit i set iff runq[i] is non-empty */
42 
/* indexed by Proc.state (see dequeueproc, dumpaproc) */
char *statename[] =
{	/* BUG: generate automatically */
	"Dead",
	"Moribund",
	"Ready",
	"Scheding",
	"Running",
	"Queueing",
	"QueueingR",
	"QueueingW",
	"Wakeme",
	"Broken",
	"Stopped",
	"Rendez",
	"Waitrelease",
};
59 
/* forward declarations for file-local helpers */
static void pidhash(Proc*);
static void pidunhash(Proc*);
static void rebalance(void);
63 
/*
 * Always splhi()'ed.
 * Set up m->sched as the scheduler re-entry point and dispose of the
 * outgoing process, if any, before picking the next one.  Never returns.
 */
void
schedinit(void)		/* never returns */
{
	Edf *e;

	setlabel(&m->sched);	/* gotolabel(&m->sched) re-enters here */
	if(up) {
		if((e = up->edf) && (e->flags & Admitted))
			edfrecord(up);
		m->proc = 0;
		switch(up->state) {
		case Running:
			ready(up);
			break;
		case Moribund:
			up->state = Dead;
			edfstop(up);
			if (up->edf)
				free(up->edf);
			up->edf = nil;

			/*
			 * Holding locks from pexit:
			 * 	procalloc
			 *	palloc
			 */
			mmurelease(up);

			/* return the Proc to the free list */
			up->qnext = procalloc.free;
			procalloc.free = up;

			unlock(&palloc);
			unlock(&procalloc);
			break;
		}
		up->mach = nil;
		updatecpu(up);
		up = nil;
	}
	sched();
}
108 
/*
 *  If changing this routine, look also at sleep().  It
 *  contains a copy of the guts of sched().
 */
void
sched(void)
{
	int x[1];
	Proc *p;

	if(m->ilockdepth)
		panic("ilockdepth %d, last lock 0x%p at 0x%lux, sched called from 0x%lux",
			m->ilockdepth, up?up->lastilock:nil,
			(up && up->lastilock)?up->lastilock->pc:0, getcallerpc(x+3));

	if(up){
		/*
		 *  delay the context switch while we hold spin locks,
		 *  but not forever: after 20 delays switch anyway.
		 */
		if(up->nlocks.ref && up->state != Moribund && up->delaysched < 20){
			up->delaysched++;
 			delayedscheds++;
			return;
		}
		up->delaysched = 0;

		splhi();

		/* statistics */
		m->cs++;

		procsave(up);
		if(setlabel(&up->sched)){
			/* resumed here when this proc is next scheduled */
			procrestore(up);
			spllo();
			return;
		}
		gotolabel(&m->sched);	/* enter schedinit(); does not return here */
	}
	p = runproc();
	if(!p->edf){
		/* recompute priority for non-real-time procs */
		updatecpu(p);
		p->priority = reprioritize(p);
	}
	if(p != m->readied)
		m->schedticks = m->ticks + HZ/10;	/* 100ms quantum */
	m->readied = 0;
	up = p;
	up->state = Running;
	up->mach = MACHP(m->machno);
	m->proc = up;
	mmuswitch(up);
	gotolabel(&up->sched);	/* resume p where it last saved its label */
}
160 
/*
 *  nonzero if any process is waiting on a run queue
 */
int
anyready(void)
{
	return runvec;
}
166 
/*
 *  nonzero if a process of strictly higher priority than up is ready
 */
int
anyhigher(void)
{
	/* mask off the bits for priorities at or below up's */
	return runvec & ~((1<<(up->priority+1))-1);
}
172 
173 /*
174  *  here once per clock tick to see if we should resched
175  */
176 void
177 hzsched(void)
178 {
179 	/* once a second, rebalance will reprioritize ready procs */
180 	if(m->machno == 0)
181 		rebalance();
182 
183 	/* unless preempted, get to run for at least 100ms */
184 	if(anyhigher()
185 	|| (!up->fixedpri && m->ticks > m->schedticks && anyready())){
186 		m->readied = nil;	/* avoid cooperative scheduling */
187 		up->delaysched++;
188 	}
189 }
190 
191 /*
192  *  here at the end of non-clock interrupts to see if we should preempt the
193  *  current process.  Returns 1 if preempted, 0 otherwise.
194  */
195 int
196 preempted(void)
197 {
198 	if(up && up->state == Running)
199 	if(up->preempted == 0)
200 	if(anyhigher())
201 	if(!active.exiting){
202 		m->readied = nil;	/* avoid cooperative scheduling */
203 		up->preempted = 1;
204 		sched();
205 		splhi();
206 		up->preempted = 0;
207 		return 1;
208 	}
209 	return 0;
210 }
211 
212 /*
213  * Update the cpu time average for this particular process,
214  * which is about to change from up -> not up or vice versa.
215  * p->lastupdate is the last time an updatecpu happened.
216  *
217  * The cpu time average is a decaying average that lasts
218  * about D clock ticks.  D is chosen to be approximately
219  * the cpu time of a cpu-intensive "quick job".  A job has to run
220  * for approximately D clock ticks before we home in on its
221  * actual cpu usage.  Thus if you manage to get in and get out
222  * quickly, you won't be penalized during your burst.  Once you
223  * start using your share of the cpu for more than about D
224  * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
225  * below all the other quick jobs.  Interactive tasks, because
226  * they basically always use less than their fair share of cpu,
227  * will be rewarded.
228  *
229  * If the process has not been running, then we want to
230  * apply the filter
231  *
232  *	cpu = cpu * (D-1)/D
233  *
234  * n times, yielding
235  *
236  *	cpu = cpu * ((D-1)/D)^n
237  *
238  * but D is big enough that this is approximately
239  *
240  * 	cpu = cpu * (D-n)/D
241  *
242  * so we use that instead.
243  *
244  * If the process has been running, we apply the filter to
245  * 1 - cpu, yielding a similar equation.  Note that cpu is
246  * stored in fixed point (* 1000).
247  *
248  * Updatecpu must be called before changing up, in order
249  * to maintain accurate cpu usage statistics.  It can be called
250  * at any time to bring the stats for a given proc up-to-date.
251  */
void
updatecpu(Proc *p)
{
	int n, t, ocpu;
	int D = schedgain*HZ*Scaling;	/* decay period, in scaled ticks */

	/* real-time procs are scheduled by deadline, not cpu usage */
	if(p->edf)
		return;

	t = MACHP(0)->ticks*Scaling + Scaling/2;
	n = t - p->lastupdate;	/* scaled ticks since the last update */
	p->lastupdate = t;

	if(n == 0)
		return;
	if(n > D)
		n = D;	/* decay is complete after D ticks; clamp */

	ocpu = p->cpu;
	if(p != up)
		p->cpu = (ocpu*(D-n))/D;	/* idle: decay toward 0 */
	else{
		/* running: decay 1-cpu, i.e. grow toward 1000 (1.0 fixed point) */
		t = 1000 - ocpu;
		t = (t*(D-n))/D;
		p->cpu = 1000 - t;
	}

//iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
}
281 
282 /*
283  * On average, p has used p->cpu of a cpu recently.
284  * Its fair share is conf.nmach/m->load of a cpu.  If it has been getting
285  * too much, penalize it.  If it has been getting not enough, reward it.
286  * I don't think you can get much more than your fair share that
287  * often, so most of the queues are for using less.  Having a priority
288  * of 3 means you're just right.  Having a higher priority (up to p->basepri)
289  * means you're not using as much as you could.
290  */
int
reprioritize(Proc *p)
{
	int fairshare, n, load, ratio;

	load = MACHP(0)->load;
	if(load == 0)
		return p->basepri;

	/*
	 *  fairshare = 1.000 * conf.nmach * 1.000/load,
	 *  except the decimal point is moved three places
	 *  on both load and fairshare.
	 */
	fairshare = (conf.nmach*1000*1000)/load;
	n = p->cpu;
	if(n == 0)
		n = 1;	/* avoid division by zero for a proc with no recorded usage */
	ratio = (fairshare+n/2) / n;	/* rounded ratio of fair share to actual usage */
	if(ratio > p->basepri)
		ratio = p->basepri;
	if(ratio < 0)
		panic("reprioritize");
//iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
	return ratio;
}
317 
/*
 * add a process to a scheduling queue
 */
void
queueproc(Schedq *rq, Proc *p)
{
	int pri;

	pri = rq - runq;	/* the queue's index is its priority */
	lock(runq);
	p->priority = pri;
	p->rnext = 0;
	/* append at the tail: FIFO within a priority level */
	if(rq->tail)
		rq->tail->rnext = p;
	else
		rq->head = p;
	rq->tail = p;
	rq->n++;
	nrdy++;
	runvec |= 1<<pri;	/* mark this priority level non-empty */
	unlock(runq);
}
340 
/*
 *  try to remove a process from a scheduling queue (called splhi)
 */
Proc*
dequeueproc(Schedq *rq, Proc *tp)
{
	Proc *l, *p;

	if(!canlock(runq))
		return nil;	/* caller retries; never spin at splhi */

	/*
	 *  the queue may have changed before we locked runq,
	 *  refind the target process.
	 */
	l = 0;
	for(p = rq->head; p; p = p->rnext){
		if(p == tp)
			break;
		l = p;
	}

	/*
	 *  p->mach==0 only when process state is saved
	 */
	if(p == 0 || p->mach){
		unlock(runq);
		return nil;
	}
	/* unlink p; l is its predecessor (nil if p was the head) */
	if(p->rnext == 0)
		rq->tail = l;
	if(l)
		l->rnext = p->rnext;
	else
		rq->head = p->rnext;
	if(rq->head == nil)
		runvec &= ~(1<<(rq-runq));	/* this priority level is now empty */
	rq->n--;
	nrdy--;
	if(p->state != Ready)
		print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);

	unlock(runq);
	return p;
}
386 
/*
 *  ready(p) picks a new priority for a process and sticks it in the
 *  runq for that priority.
 */
void
ready(Proc *p)
{
	int s, pri;
	Schedq *rq;
	void (*pt)(Proc*, int, vlong);

	s = splhi();
	if(edfready(p)){
		/* real-time procs are queued by the edf scheduler */
		splx(s);
		return;
	}

	if(up != p)
		m->readied = p;	/* group scheduling */

	updatecpu(p);
	pri = reprioritize(p);
	p->priority = pri;
	rq = &runq[pri];
	p->state = Ready;
	queueproc(rq, p);
	pt = proctrace;
	if(pt)
		pt(p, SReady, 0);
	splx(s);
}
418 
419 /*
420  *  yield the processor and drop our priority
421  */
422 void
423 yield(void)
424 {
425 	if(anyready()){
426 		/* pretend we just used 1/2 tick */
427 		up->lastupdate -= Scaling/2;
428 		sched();
429 	}
430 }
431 
/*
 *  recalculate priorities once a second.  We need to do this
 *  since priorities will otherwise only be recalculated when
 *  the running process blocks.
 */
ulong balancetime;	/* m->ticks at the last rebalance */

static void
rebalance(void)
{
	int pri, npri, t, x;
	Schedq *rq;
	Proc *p;

	t = m->ticks;
	if(t - balancetime < HZ)
		return;	/* at most once a second */
	balancetime = t;

	for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
another:
		p = rq->head;
		if(p == nil)
			continue;
		/* only touch procs with affinity to this processor */
		if(p->mp != MACHP(m->machno))
			continue;
		if(pri == p->basepri)
			continue;
		updatecpu(p);
		npri = reprioritize(p);
		if(npri != pri){
			x = splhi();
			p = dequeueproc(rq, p);
			if(p)
				queueproc(&runq[npri], p);
			splx(x);
			goto another;	/* re-examine the new head of this queue */
		}
	}
}
472 
473 
/*
 *  pick a process to run
 */
Proc*
runproc(void)
{
	Schedq *rq;
	Proc *p;
	ulong start, now;
	int i;
	void (*pt)(Proc*, int, vlong);

	start = perfticks();

	/* cooperative scheduling until the clock ticks */
	if(coopsched && (p=m->readied) && p->mach==0 && p->state==Ready
	&& runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
		skipscheds++;
		rq = &runq[p->priority];
		goto found;
	}

	preempts++;

loop:
	/*
	 *  find a process that last ran on this processor (affinity),
	 *  or one that hasn't moved in a while (load balancing).  Every
	 *  time around the loop affinity goes down.
	 */
	spllo();
	for(i = 0;; i++){
		/*
		 *  find the highest priority target process that this
		 *  processor can run given affinity constraints.
		 *
		 */
		for(rq = &runq[Nrq-1]; rq >= runq; rq--){
			for(p = rq->head; p; p = p->rnext){
				if(p->mp == nil || p->mp == MACHP(m->machno)
				|| (!p->wired && i > 0))
					goto found;
			}
		}

		/* waste time or halt the CPU */
		idlehands();

		/* remember how much time we're here */
		now = perfticks();
		m->perf.inidle += now-start;
		start = now;
	}

found:
	splhi();
	p = dequeueproc(rq, p);
	if(p == nil)	/* lost a race for p to another processor; look again */
		goto loop;

	p->state = Scheding;
	p->mp = MACHP(m->machno);	/* record affinity to this processor */

	if(edflock(p)){
		edfrun(p, rq == &runq[PriEdf]);	/* start deadline timer and do admin */
		edfunlock();
	}
	pt = proctrace;
	if(pt)
		pt(p, SRun, 0);
	return p;
}
546 
547 int
548 canpage(Proc *p)
549 {
550 	int ok = 0;
551 
552 	splhi();
553 	lock(runq);
554 	/* Only reliable way to see if we are Running */
555 	if(p->mach == 0) {
556 		p->newtlb = 1;
557 		ok = 1;
558 	}
559 	unlock(runq);
560 	spllo();
561 
562 	return ok;
563 }
564 
/*
 *  allocate a Proc from the free list, waiting if none is
 *  available, and initialize it to a clean default state.
 */
Proc*
newproc(void)
{
	Proc *p;

	lock(&procalloc);
	for(;;) {
		if(p = procalloc.free)
			break;

		/* no free slot: wait for one and retry */
		unlock(&procalloc);
		resrcwait("no procs");
		lock(&procalloc);
	}
	procalloc.free = p->qnext;
	unlock(&procalloc);

	p->state = Scheding;
	p->psstate = "New";
	p->mach = 0;
	p->qnext = 0;
	p->nchild = 0;
	p->nwait = 0;
	p->waitq = 0;
	p->parent = 0;
	p->pgrp = 0;
	p->egrp = 0;
	p->fgrp = 0;
	p->rgrp = 0;
	p->pdbg = 0;
	p->fpstate = FPinit;
	p->kp = 0;
	p->procctl = 0;
	p->notepending = 0;
	p->ureg = 0;
	p->privatemem = 0;
	p->noswap = 0;
	p->errstr = p->errbuf0;
	p->syserrstr = p->errbuf1;
	p->errbuf0[0] = '\0';
	p->errbuf1[0] = '\0';
	p->nlocks.ref = 0;
	p->delaysched = 0;
	p->trace = 0;
	kstrdup(&p->user, "*nouser");
	kstrdup(&p->text, "*notext");
	kstrdup(&p->args, "");
	p->nargs = 0;
	p->setargs = 0;
	memset(p->seg, 0, sizeof p->seg);
	p->pid = incref(&pidalloc);
	pidhash(p);
	p->noteid = incref(&noteidalloc);
	if(p->pid==0 || p->noteid==0)
		panic("pidalloc");
	/* the kernel stack survives across incarnations of this slot */
	if(p->kstack == 0)
		p->kstack = smalloc(KSTACK);

	/* sched params */
	p->mp = 0;
	p->wired = 0;
	procpriority(p, PriNormal, 0);
	p->cpu = 0;
	p->lastupdate = MACHP(0)->ticks*Scaling;
	p->edf = nil;

	return p;
}
633 
/*
 * wire this proc to a machine; bm < 0 means pick the machine
 * with the fewest wired procs.
 */
void
procwired(Proc *p, int bm)
{
	Proc *pp;
	int i;
	char nwired[MAXMACH];
	Mach *wm;

	if(bm < 0){
		/* pick a machine to wire to: count wired procs per machine */
		memset(nwired, 0, sizeof(nwired));
		p->wired = 0;
		pp = proctab(0);
		for(i=0; i<conf.nproc; i++, pp++){
			wm = pp->wired;
			if(wm && pp->pid)
				nwired[wm->machno]++;
		}
		bm = 0;
		for(i=0; i<conf.nmach; i++)
			if(nwired[i] < nwired[bm])
				bm = i;
	} else {
		/* use the virtual machine requested */
		bm = bm % conf.nmach;
	}

	p->wired = MACHP(bm);
	p->mp = p->wired;	/* scheduling affinity follows the wiring */
}
667 
668 void
669 procpriority(Proc *p, int pri, int fixed)
670 {
671 	if(pri >= Npriq)
672 		pri = Npriq - 1;
673 	else if(pri < 0)
674 		pri = 0;
675 	p->basepri = pri;
676 	p->priority = pri;
677 	if(fixed){
678 		p->fixedpri = 1;
679 	} else {
680 		p->fixedpri = 0;
681 	}
682 }
683 
/*
 *  allocate the process table at boot and thread it into a free list.
 */
void
procinit0(void)		/* bad planning - clashes with devproc.c */
{
	Proc *p;
	int i;

	procalloc.free = xalloc(conf.nproc*sizeof(Proc));
	if(procalloc.free == nil)
		panic("cannot allocate %lud procs\n", conf.nproc);
	procalloc.arena = procalloc.free;

	/* link every Proc to the next via qnext; last one terminates the list */
	p = procalloc.free;
	for(i=0; i<conf.nproc-1; i++,p++)
		p->qnext = p+1;
	p->qnext = 0;
}
700 
/*
 *  sleep if a condition is not true.  Another process will
 *  awaken us after it sets the condition.  When we awaken
 *  the condition may no longer be true.
 *
 *  we lock both the process and the rendezvous to keep r->p
 *  and p->r synchronized.
 */
void
sleep(Rendez *r, int (*f)(void*), void *arg)
{
	int s;
	void (*pt)(Proc*, int, vlong);

	s = splhi();

	if(up->nlocks.ref)
		print("process %lud sleeps with %lud locks held, last lock 0x%p locked at pc 0x%lux, sleep called from 0x%lux\n",
			up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
	lock(r);
	lock(&up->rlock);
	if(r->p){
		print("double sleep called from 0x%lux, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
		dumpstack();
	}

	/*
	 *  Wakeup only knows there may be something to do by testing
	 *  r->p in order to get something to lock on.
	 *  Flush that information out to memory in case the sleep is
	 *  committed.
	 */
	r->p = up;

	if((*f)(arg) || up->notepending){
		/*
		 *  if condition happened or a note is pending
		 *  never mind
		 */
		r->p = nil;
		unlock(&up->rlock);
		unlock(r);
	} else {
		/*
		 *  now we are committed to
		 *  change state and call scheduler
		 */
		pt = proctrace;
		if(pt)
			pt(up, SSleep, 0);
		up->state = Wakeme;
		up->r = r;

		/* statistics */
		m->cs++;

		procsave(up);
		if(setlabel(&up->sched)) {
			/*
			 *  here when the process is awakened
			 */
			procrestore(up);
			spllo();
		} else {
			/*
			 *  here to go to sleep (i.e. stop Running)
			 */
			unlock(&up->rlock);
			unlock(r);
			gotolabel(&m->sched);	/* enter the scheduler; wakeup() resumes us above */
		}
	}

	/* a note posted during the sleep interrupts it */
	if(up->notepending) {
		up->notepending = 0;
		splx(s);
		error(Eintr);
	}

	splx(s);
}
782 
783 static int
784 tfn(void *arg)
785 {
786 	return up->trend == nil || up->tfn(arg);
787 }
788 
/*
 *  timer handler for tsleep/tnsleep: wake the proc waiting on its
 *  timer rendezvous, if any.
 */
void
twakeup(Ureg*, Timer *t)
{
	Proc *p;
	Rendez *trend;

	p = t->ta;
	trend = p->trend;
	p->trend = 0;	/* tfn() now returns true, letting the sleep complete */
	if(trend)
		wakeup(trend);
}
801 
/* Sleep until todget() >= ns.  NOTE: ns is not an interval */
void
tnsleep(Rendez *r, int (*fn)(void*), void *arg, vlong ns)
{
	if (up->tt){
		/* a stale timer left over from an earlier sleep; remove it */
		print("tnsleep: timer active: mode %d, tf 0x%lux\n", up->tmode, up->tf);
		timerdel(up);
	}
	up->tns = ns;
	up->tf = twakeup;
	up->tmode = Tabsolute;
	up->ta = up;
	up->trend = r;
	up->tfn = fn;	/* consulted by tfn() along with trend */
	timeradd(up);

	if(waserror()){
		/* an interrupting note must not leave the timer armed */
		timerdel(up);
		nexterror();
	}
	sleep(r, tfn, arg);
	if (up->tt)
		timerdel(up);
	up->twhen = 0;
	poperror();
}
828 
/*
 *  like tnsleep, but with a relative timeout of ms milliseconds.
 */
void
tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
{
	if (up->tt){
		/* a stale timer left over from an earlier sleep; remove it */
		print("tsleep: timer active: mode %d, tf 0x%lux\n", up->tmode, up->tf);
		timerdel(up);
	}
	up->tns = MS2NS(ms);
	up->tf = twakeup;
	up->tmode = Trelative;
	up->ta = up;
	up->trend = r;
	up->tfn = fn;	/* consulted by tfn() along with trend */
	timeradd(up);

	if(waserror()){
		/* an interrupting note must not leave the timer armed */
		timerdel(up);
		nexterror();
	}
	sleep(r, tfn, arg);
	if (up->tt)
		timerdel(up);
	up->twhen = 0;
	poperror();
}
854 
/*
 *  Expects that only one process can call wakeup for any given Rendez.
 *  We hold both locks to ensure that r->p and p->r remain consistent.
 *  Richard Miller has a better solution that doesn't require both to
 *  be held simultaneously, but I'm a paranoid - presotto.
 */
Proc*
wakeup(Rendez *r)
{
	Proc *p;
	int s;

	s = splhi();

	lock(r);
	p = r->p;

	if(p != nil){
		lock(&p->rlock);
		if(p->state != Wakeme || p->r != r)
			panic("wakeup: state");
		/* clear both sides of the handshake before readying p */
		r->p = nil;
		p->r = nil;
		ready(p);
		unlock(&p->rlock);
	}
	unlock(r);

	splx(s);

	return p;	/* nil if nobody was sleeping on r */
}
887 
/*
 *  if waking a sleeping process, this routine must hold both
 *  p->rlock and r->lock.  However, it can't know them in
 *  the same order as wakeup causing a possible lock ordering
 *  deadlock.  We break the deadlock by giving up the p->rlock
 *  lock if we can't get the r->lock and retrying.
 */
int
postnote(Proc *p, int dolock, char *n, int flag)
{
	int s, ret;
	Rendez *r;
	Proc *d, **l;

	if(dolock)
		qlock(&p->debug);

	/* a kernel note preempts any queued user notes */
	if(flag != NUser && (p->notify == 0 || p->notified))
		p->nnote = 0;

	ret = 0;
	if(p->nnote < NNOTE) {
		strcpy(p->note[p->nnote].msg, n);
		p->note[p->nnote++].flag = flag;
		ret = 1;
	}
	p->notepending = 1;
	if(dolock)
		qunlock(&p->debug);

	/* this loop is to avoid lock ordering problems. */
	for(;;){
		s = splhi();
		lock(&p->rlock);
		r = p->r;

		/* waiting for a wakeup? */
		if(r == nil)
			break;	/* no */

		/* try for the second lock */
		if(canlock(r)){
			if(p->state != Wakeme || r->p != p)
				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
			p->r = nil;
			r->p = nil;
			ready(p);
			unlock(r);
			break;
		}

		/* give other process time to get out of critical section and try again */
		unlock(&p->rlock);
		splx(s);
		sched();
	}
	unlock(&p->rlock);
	splx(s);

	if(p->state != Rendezvous)
		return ret;

	/* Try and pull out of a rendezvous */
	lock(p->rgrp);
	if(p->state == Rendezvous) {
		p->rendval = ~0;	/* rendezvous fails with an error value */
		/* unlink p from its rendezvous hash chain */
		l = &REND(p->rgrp, p->rendtag);
		for(d = *l; d; d = d->rendhash) {
			if(d == p) {
				*l = p->rendhash;
				break;
			}
			l = &d->rendhash;
		}
		ready(p);
	}
	unlock(p->rgrp);
	return ret;
}
967 
/*
 * weird thing: keep at most NBROKEN around
 */
#define	NBROKEN 4
struct
{
	QLock;
	int	n;		/* number of broken procs held */
	Proc	*p[NBROKEN];	/* oldest first; p[0] is evicted when full */
}broken;
978 
/*
 *  park p in the Broken state so a debugger can examine it.
 *  NOTE(review): mixes p and up (edfstop(up) vs p->state); callers
 *  appear to pass p == up — confirm.
 */
void
addbroken(Proc *p)
{
	qlock(&broken);
	if(broken.n == NBROKEN) {
		/* list full: release the oldest broken proc */
		ready(broken.p[0]);
		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
		--broken.n;
	}
	broken.p[broken.n++] = p;
	qunlock(&broken);

	edfstop(up);
	p->state = Broken;
	p->psstate = 0;
	sched();	/* does not run again until unbreak()/freebroken() */
}
996 
997 void
998 unbreak(Proc *p)
999 {
1000 	int b;
1001 
1002 	qlock(&broken);
1003 	for(b=0; b < broken.n; b++)
1004 		if(broken.p[b] == p) {
1005 			broken.n--;
1006 			memmove(&broken.p[b], &broken.p[b+1],
1007 					sizeof(Proc*)*(NBROKEN-(b+1)));
1008 			ready(p);
1009 			break;
1010 		}
1011 	qunlock(&broken);
1012 }
1013 
1014 int
1015 freebroken(void)
1016 {
1017 	int i, n;
1018 
1019 	qlock(&broken);
1020 	n = broken.n;
1021 	for(i=0; i<n; i++) {
1022 		ready(broken.p[i]);
1023 		broken.p[i] = 0;
1024 	}
1025 	broken.n = 0;
1026 	qunlock(&broken);
1027 	return n;
1028 }
1029 
/*
 *  terminate the current process: release its resources, deliver a
 *  wait record to the parent, and go Moribund so that schedinit()
 *  can finish the burial.  Never returns.
 */
void
pexit(char *exitstr, int freemem)
{
	Proc *p;
	Segment **s, **es;
	long utime, stime;
	Waitq *wq, *f, *next;
	Fgrp *fgrp;
	Egrp *egrp;
	Rgrp *rgrp;
	Pgrp *pgrp;
	Chan *dot;
	void (*pt)(Proc*, int, vlong);

	up->alarm = 0;
	if (up->tt)
		timerdel(up);
	pt = proctrace;
	if(pt)
		pt(up, SDead, 0);

	/* nil out all the resources under lock (free later) */
	qlock(&up->debug);
	fgrp = up->fgrp;
	up->fgrp = nil;
	egrp = up->egrp;
	up->egrp = nil;
	rgrp = up->rgrp;
	up->rgrp = nil;
	pgrp = up->pgrp;
	up->pgrp = nil;
	dot = up->dot;
	up->dot = nil;
	qunlock(&up->debug);

	if(fgrp)
		closefgrp(fgrp);
	if(egrp)
		closeegrp(egrp);
	if(rgrp)
		closergrp(rgrp);
	if(dot)
		cclose(dot);
	if(pgrp)
		closepgrp(pgrp);

	/*
	 * if not a kernel process and have a parent,
	 * do some housekeeping.
	 */
	if(up->kp == 0) {
		p = up->parent;
		if(p == 0) {
			if(exitstr == 0)
				exitstr = "unknown";
			panic("boot process died: %s", exitstr);
		}

		/* smalloc can raise; keep retrying rather than losing the exit */
		while(waserror())
			;

		wq = smalloc(sizeof(Waitq));
		poperror();

		wq->w.pid = up->pid;
		utime = up->time[TUser] + up->time[TCUser];
		stime = up->time[TSys] + up->time[TCSys];
		wq->w.time[TUser] = tk2ms(utime);
		wq->w.time[TSys] = tk2ms(stime);
		wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
		if(exitstr && exitstr[0])
			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
		else
			wq->w.msg[0] = '\0';

		lock(&p->exl);
		/*
		 *  If my parent is no longer alive, or if there would be more
		 *  than 128 zombie child processes for my parent, then don't
		 *  leave a wait record behind.  This helps prevent badly
		 *  written daemon processes from accumulating lots of wait
		 *  records.
		 */
		if(p->pid == up->parentpid && p->state != Broken && p->nwait < 128) {
			p->nchild--;
			p->time[TCUser] += utime;
			p->time[TCSys] += stime;

			wq->next = p->waitq;
			p->waitq = wq;
			p->nwait++;

			wakeup(&p->waitr);
			unlock(&p->exl);
		}
		else {
			unlock(&p->exl);
			free(wq);
		}
	}

	if(!freemem)
		addbroken(up);	/* parks here until unbroken, then continues */

	/* release our memory segments */
	qlock(&up->seglock);
	es = &up->seg[NSEG];
	for(s = up->seg; s < es; s++) {
		if(*s) {
			putseg(*s);
			*s = 0;
		}
	}
	qunlock(&up->seglock);

	lock(&up->exl);		/* Prevent my children from leaving waits */
	pidunhash(up);
	up->pid = 0;
	wakeup(&up->waitr);
	unlock(&up->exl);

	/* discard unclaimed wait records from my own children */
	for(f = up->waitq; f; f = next) {
		next = f->next;
		free(f);
	}

	/* release debuggers */
	qlock(&up->debug);
	if(up->pdbg) {
		wakeup(&up->pdbg->sleep);
		up->pdbg = 0;
	}
	qunlock(&up->debug);

	/* Sched must not loop for these locks */
	lock(&procalloc);
	lock(&palloc);	/* both are released by schedinit() for the Moribund case */

	edfstop(up);
	up->state = Moribund;
	sched();
	panic("pexit");
}
1172 
1173 int
1174 haswaitq(void *x)
1175 {
1176 	Proc *p;
1177 
1178 	p = (Proc *)x;
1179 	return p->waitq != 0;
1180 }
1181 
/*
 *  wait for a child to exit; returns the child's pid and, if w is
 *  non-nil, copies the child's wait message into it.  Raises Einuse
 *  if another wait is already in progress, Enochild if there is
 *  nothing to wait for.
 */
ulong
pwait(Waitmsg *w)
{
	ulong cpid;
	Waitq *wq;

	if(!canqlock(&up->qwaitr))
		error(Einuse);	/* only one waiter at a time */

	if(waserror()) {
		qunlock(&up->qwaitr);
		nexterror();
	}

	lock(&up->exl);
	if(up->nchild == 0 && up->waitq == 0) {
		unlock(&up->exl);
		error(Enochild);
	}
	unlock(&up->exl);

	sleep(&up->waitr, haswaitq, up);

	/* pop the first wait record off the queue */
	lock(&up->exl);
	wq = up->waitq;
	up->waitq = wq->next;
	up->nwait--;
	unlock(&up->exl);

	qunlock(&up->qwaitr);
	poperror();

	if(w)
		memmove(w, &wq->w, sizeof(Waitmsg));
	cpid = wq->w.pid;
	free(wq);
	return cpid;
}
1220 
1221 Proc*
1222 proctab(int i)
1223 {
1224 	return &procalloc.arena[i];
1225 }
1226 
/*
 *  print a one-line summary of p to the console.
 */
void
dumpaproc(Proc *p)
{
	ulong bss;
	char *s;

	if(p == 0)
		return;

	bss = 0;
	if(p->seg[BSEG])
		bss = p->seg[BSEG]->top;

	/* prefer the textual sub-state if set, else the scheduler state */
	s = p->psstate;
	if(s == 0)
		s = statename[p->state];
	print("%3lud:%10s pc %8lux dbgpc %8lux  %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
		p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
		p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
}
1247 
1248 void
1249 procdump(void)
1250 {
1251 	int i;
1252 	Proc *p;
1253 
1254 	if(up)
1255 		print("up %lud\n", up->pid);
1256 	else
1257 		print("no current process\n");
1258 	for(i=0; i<conf.nproc; i++) {
1259 		p = &procalloc.arena[i];
1260 		if(p->state == Dead)
1261 			continue;
1262 
1263 		dumpaproc(p);
1264 	}
1265 }
1266 
/*
 *  wait till all processes have flushed their mmu
 *  state about segment s
 */
void
procflushseg(Segment *s)
{
	int i, ns, nm, nwait;
	Proc *p;

	/*
	 *  tell all processes with this
	 *  segment to flush their mmu's
	 */
	nwait = 0;
	for(i=0; i<conf.nproc; i++) {
		p = &procalloc.arena[i];
		if(p->state == Dead)
			continue;
		for(ns = 0; ns < NSEG; ns++)
			if(p->seg[ns] == s){
				p->newtlb = 1;
				/* flag every processor currently running p */
				for(nm = 0; nm < conf.nmach; nm++){
					if(MACHP(nm)->proc == p){
						MACHP(nm)->flushmmu = 1;
						nwait++;
					}
				}
				break;
			}
	}

	if(nwait == 0)
		return;

	/*
	 *  wait for all processors to take a clock interrupt
	 *  and flush their mmu's
	 */
	for(nm = 0; nm < conf.nmach; nm++)
		if(MACHP(nm) != m)
			while(MACHP(nm)->flushmmu)
				sched();
}
1311 
1312 void
1313 scheddump(void)
1314 {
1315 	Proc *p;
1316 	Schedq *rq;
1317 
1318 	for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1319 		if(rq->head == 0)
1320 			continue;
1321 		print("rq%ld:", rq-runq);
1322 		for(p = rq->head; p; p = p->rnext)
1323 			print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1324 		print("\n");
1325 		delay(150);
1326 	}
1327 	print("nrdy %d\n", nrdy);
1328 }
1329 
/*
 *  create and start a kernel process running func(arg), inheriting
 *  notes, dot and slash from the current process.
 */
void
kproc(char *name, void (*func)(void *), void *arg)
{
	Proc *p;
	static Pgrp *kpgrp;	/* all kprocs share one process group */

	p = newproc();
	p->psstate = 0;
	p->procmode = 0640;
	p->kp = 1;
	p->noswap = 1;

	p->fpsave = up->fpsave;
	p->scallnr = up->scallnr;
	p->s = up->s;
	p->nerrlab = 0;
	p->slash = up->slash;
	p->dot = up->dot;
	incref(p->dot);

	memmove(p->note, up->note, sizeof(p->note));
	p->nnote = up->nnote;
	p->notified = 0;
	p->lastnote = up->lastnote;
	p->notify = up->notify;
	p->ureg = 0;
	p->dbgreg = 0;

	procpriority(p, PriKproc, 0);

	kprocchild(p, func, arg);	/* arrange for p to start in func(arg) */

	kstrdup(&p->user, eve);
	kstrdup(&p->text, name);
	if(kpgrp == 0)
		kpgrp = newpgrp();
	p->pgrp = kpgrp;
	incref(kpgrp);

	memset(p->time, 0, sizeof(p->time));
	p->time[TReal] = MACHP(0)->ticks;
	ready(p);
	/*
	 *  since the bss/data segments are now shareable,
	 *  any mmu info about this process is now stale
	 *  and has to be discarded.
	 */
	p->newtlb = 1;
	flushmmu();
}
1380 
/*
 *  called splhi() by notify().  See comment in notify for the
 *  reasoning.  Carries out a pending control action on p.
 */
void
procctl(Proc *p)
{
	char *state;
	ulong s;

	switch(p->procctl) {
	case Proc_exitbig:
		spllo();
		pexit("Killed: Insufficient physical memory", 1);

	case Proc_exitme:
		spllo();		/* pexit has locks in it */
		pexit("Killed", 1);

	case Proc_traceme:
		/* only stop when there is a note for the debugger to see */
		if(p->nnote == 0)
			return;
		/* No break */

	case Proc_stopme:
		p->procctl = 0;
		state = p->psstate;
		p->psstate = "Stopped";
		/* free a waiting debugger */
		s = spllo();
		qlock(&p->debug);
		if(p->pdbg) {
			wakeup(&p->pdbg->sleep);
			p->pdbg = 0;
		}
		qunlock(&p->debug);
		splhi();
		p->state = Stopped;
		sched();	/* stays Stopped until a debugger readies us */
		p->psstate = state;
		splx(s);
		return;
	}
}
1425 
1426 #include "errstr.h"
1427 
/*
 *  raise an error: record the message in up->errstr and unwind
 *  to the most recent waserror().
 */
void
error(char *err)
{
	spllo();

	assert(up->nerrlab < NERR);
	kstrcpy(up->errstr, err, ERRMAX);
	setlabel(&up->errlab[NERR-1]);	/* NOTE(review): records the raise point in the top label slot — confirm intent */
	nexterror();
}
1438 
/*
 *  pop the most recent error label and jump to it, unwinding
 *  one level of waserror().
 */
void
nexterror(void)
{
	gotolabel(&up->errlab[--up->nerrlab]);
}
1444 
1445 void
1446 exhausted(char *resource)
1447 {
1448 	char buf[ERRMAX];
1449 
1450 	sprint(buf, "no free %s", resource);
1451 	iprint("%s\n", buf);
1452 	error(buf);
1453 }
1454 
1455 void
1456 killbig(void)
1457 {
1458 	int i;
1459 	Segment *s;
1460 	ulong l, max;
1461 	Proc *p, *ep, *kp;
1462 
1463 	max = 0;
1464 	kp = 0;
1465 	ep = procalloc.arena+conf.nproc;
1466 	for(p = procalloc.arena; p < ep; p++) {
1467 		if(p->state == Dead || p->kp)
1468 			continue;
1469 		l = 0;
1470 		for(i=1; i<NSEG; i++) {
1471 			s = p->seg[i];
1472 			if(s != 0)
1473 				l += s->top - s->base;
1474 		}
1475 		if(l > max && strcmp(p->text, "kfs") != 0){
1476 			kp = p;
1477 			max = l;
1478 		}
1479 	}
1480 	kp->procctl = Proc_exitbig;
1481 	for(i = 0; i < NSEG; i++) {
1482 		s = kp->seg[i];
1483 		if(s != 0 && canqlock(&s->lk)) {
1484 			mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1485 			qunlock(&s->lk);
1486 		}
1487 	}
1488 	print("%lud: %s killed because no swap configured\n", kp->pid, kp->text);
1489 }
1490 
1491 /*
1492  *  change ownership to 'new' of all processes owned by 'old'.  Used when
1493  *  eve changes.
1494  */
1495 void
1496 renameuser(char *old, char *new)
1497 {
1498 	Proc *p, *ep;
1499 
1500 	ep = procalloc.arena+conf.nproc;
1501 	for(p = procalloc.arena; p < ep; p++)
1502 		if(p->user!=nil && strcmp(old, p->user)==0)
1503 			kstrdup(&p->user, new);
1504 }
1505 
/*
 *  time accounting called by clock() splhi'd
 */
void
accounttime(void)
{
	Proc *p;
	ulong n, per;
	static ulong nrun;	/* ticks with a proc running since the last load calc */

	p = m->proc;
	if(p) {
		nrun++;
		p->time[p->insyscall]++;	/* charge the tick to user or sys time */
	}

	/* calculate decaying duty cycles */
	n = perfticks();
	per = n - m->perf.last;
	m->perf.last = n;
	per = (m->perf.period*(HZ-1) + per)/HZ;
	if(per != 0)
		m->perf.period = per;

	m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
	m->perf.inidle = 0;

	m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
	m->perf.inintr = 0;

	/* only one processor gets to compute system load averages */
	if(m->machno != 0)
		return;

	/*
	 * calculate decaying load average.
	 * if we decay by (n-1)/n then it takes
	 * n clock ticks to go from load L to .36 L once
	 * things quiet down.  it takes about 5 n clock
	 * ticks to go to zero.  so using HZ means this is
	 * approximately the load over the last second,
	 * with a tail lasting about 5 seconds.
	 */
	n = nrun;
	nrun = 0;
	n = (nrdy+n)*1000;	/* load is in fixed point (* 1000) */
	m->load = (m->load*(HZ-1)+n)/HZ;
}
1554 
1555 static void
1556 pidhash(Proc *p)
1557 {
1558 	int h;
1559 
1560 	h = p->pid % nelem(procalloc.ht);
1561 	lock(&procalloc);
1562 	p->pidhash = procalloc.ht[h];
1563 	procalloc.ht[h] = p;
1564 	unlock(&procalloc);
1565 }
1566 
1567 static void
1568 pidunhash(Proc *p)
1569 {
1570 	int h;
1571 	Proc **l;
1572 
1573 	h = p->pid % nelem(procalloc.ht);
1574 	lock(&procalloc);
1575 	for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1576 		if(*l == p){
1577 			*l = p->pidhash;
1578 			break;
1579 		}
1580 	unlock(&procalloc);
1581 }
1582 
1583 int
1584 procindex(ulong pid)
1585 {
1586 	Proc *p;
1587 	int h;
1588 	int s;
1589 
1590 	s = -1;
1591 	h = pid % nelem(procalloc.ht);
1592 	lock(&procalloc);
1593 	for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1594 		if(p->pid == pid){
1595 			s = p - procalloc.arena;
1596 			break;
1597 		}
1598 	unlock(&procalloc);
1599 	return s;
1600 }
1601