xref: /plan9/sys/src/9/port/proc.c (revision f9e1cf08d3be51592e03e639fc848a68dc31a55e)
1 #include	<u.h>
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 #include	"edf.h"
8 #include	<trace.h>
9 
10 int	schedgain = 30;	/* units in seconds */
11 int	nrdy;
12 Ref	noteidalloc;
13 
14 void updatecpu(Proc*);
15 int reprioritize(Proc*);
16 
17 ulong	delayedscheds;	/* statistics */
18 long skipscheds;
19 long preempts;
20 ulong load;
21 
22 static Ref	pidalloc;
23 
24 static struct Procalloc
25 {
26 	Lock;
27 	Proc*	ht[128];
28 	Proc*	arena;
29 	Proc*	free;
30 } procalloc;
31 
32 enum
33 {
34 	Q=10,
35 	DQ=4,
36 	Scaling=2,
37 };
38 
39 Schedq	runq[Nrq];
40 ulong	runvec;
41 
42 char *statename[] =
43 {	/* BUG: generate automatically */
44 	"Dead",
45 	"Moribund",
46 	"Ready",
47 	"Scheding",
48 	"Running",
49 	"Queueing",
50 	"QueueingR",
51 	"QueueingW",
52 	"Wakeme",
53 	"Broken",
54 	"Stopped",
55 	"Rendez",
56 	"Waitrelease",
57 };
58 
59 static void pidhash(Proc*);
60 static void pidunhash(Proc*);
61 static void rebalance(void);
62 
63 /*
64  * Always splhi()'ed.
65  */
66 void
67 schedinit(void)		/* never returns */
68 {
69 	Edf *e;
70 
71 	setlabel(&m->sched);
72 	if(up) {
73 		if((e = up->edf) && (e->flags & Admitted))
74 			edfrecord(up);
75 		m->proc = 0;
76 		switch(up->state) {
77 		case Running:
78 			ready(up);
79 			break;
80 		case Moribund:
81 			up->state = Dead;
82 			edfstop(up);
83 			if (up->edf)
84 				free(up->edf);
85 			up->edf = nil;
86 
87 			/*
88 			 * Holding locks from pexit:
89 			 * 	procalloc
90 			 *	palloc
91 			 */
92 			mmurelease(up);
93 
94 			up->qnext = procalloc.free;
95 			procalloc.free = up;
96 
97 			unlock(&palloc);
98 			unlock(&procalloc);
99 			break;
100 		}
101 		up->mach = nil;
102 		updatecpu(up);
103 		up = nil;
104 	}
105 	sched();
106 }
107 
108 /*
109  *  If changing this routine, look also at sleep().  It
110  *  contains a copy of the guts of sched().
111  */
112 void
113 sched(void)
114 {
115 	Proc *p;
116 
117 	if(m->ilockdepth)
118 		panic("ilockdepth %d, last lock 0x%p at 0x%lux, sched called from 0x%lux",
119 			m->ilockdepth, up?up->lastilock:nil,
120 			(up && up->lastilock)?up->lastilock->pc:0,
121 			getcallerpc(&p+2));
122 
123 	if(up){
124 		/*
125 		 * Delay the sched until the process gives up the locks
126 		 * it is holding.  This avoids dumb lock loops.
127 		 * Don't delay if the process is Moribund.
128 		 * It called sched to die.
129 		 * But do sched eventually.  This avoids a missing unlock
130 		 * from hanging the entire kernel.
131 		 * But don't reschedule procs holding palloc or procalloc.
132 		 * Those are far too important to be holding while asleep.
133 		 *
134 		 * This test is not exact.  There can still be a few instructions
135 		 * in the middle of taslock when a process holds a lock
136 		 * but Lock.p has not yet been initialized.
137 		 */
138 		if(up->nlocks.ref)
139 		if(up->state != Moribund)
140 		if(up->delaysched < 20
141 		|| palloc.Lock.p == up
142 		|| procalloc.Lock.p == up){
143 			up->delaysched++;
144 			delayedscheds++;
145 			return;
146 		}
147 		up->delaysched = 0;
148 
149 		splhi();
150 
151 		/* statistics */
152 		m->cs++;
153 
154 		procsave(up);
155 		if(setlabel(&up->sched)){
156 			procrestore(up);
157 			spllo();
158 			return;
159 		}
160 		gotolabel(&m->sched);
161 	}
162 	p = runproc();
163 	if(!p->edf){
164 		updatecpu(p);
165 		p->priority = reprioritize(p);
166 	}
167 	if(p != m->readied)
168 		m->schedticks = m->ticks + HZ/10;
169 	m->readied = 0;
170 	up = p;
171 	up->state = Running;
172 	up->mach = MACHP(m->machno);
173 	m->proc = up;
174 	mmuswitch(up);
175 	gotolabel(&up->sched);
176 }
177 
178 int
179 anyready(void)
180 {
181 	return runvec;
182 }
183 
184 int
185 anyhigher(void)
186 {
187 	return runvec & ~((1<<(up->priority+1))-1);
188 }
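
/*
 * A worked example of the bit arithmetic above (illustrative only):
 * runvec keeps bit i set while runq[i] is non-empty.  With
 * up->priority == 3,
 *
 *	(1<<(3+1))-1 = 0x0f	covers priorities 0..3
 *	runvec & ~0x0f		tests priorities 4 and up
 *
 * so anyhigher() is non-zero exactly when some strictly
 * higher-priority queue holds a ready process.
 */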
189 
190 /*
191  *  here once per clock tick to see if we should resched
192  */
193 void
194 hzsched(void)
195 {
196 	/* once a second, rebalance will reprioritize ready procs */
197 	if(m->machno == 0)
198 		rebalance();
199 
200 	/* unless preempted, get to run for at least 100ms */
201 	if(anyhigher()
202 	|| (!up->fixedpri && m->ticks > m->schedticks && anyready())){
203 		m->readied = nil;	/* avoid cooperative scheduling */
204 		up->delaysched++;
205 	}
206 }
207 
208 /*
209  *  here at the end of non-clock interrupts to see if we should preempt the
210  *  current process.  Returns 1 if preempted, 0 otherwise.
211  */
212 int
213 preempted(void)
214 {
215 	if(up && up->state == Running)
216 	if(up->preempted == 0)
217 	if(anyhigher())
218 	if(!active.exiting){
219 		m->readied = nil;	/* avoid cooperative scheduling */
220 		up->preempted = 1;
221 		sched();
222 		splhi();
223 		up->preempted = 0;
224 		return 1;
225 	}
226 	return 0;
227 }
228 
229 /*
230  * Update the cpu time average for this particular process,
231  * which is about to change from up -> not up or vice versa.
232  * p->lastupdate is the last time an updatecpu happened.
233  *
234  * The cpu time average is a decaying average that lasts
235  * about D clock ticks.  D is chosen to be approximately
236  * the cpu time of a cpu-intensive "quick job".  A job has to run
237  * for approximately D clock ticks before we home in on its
238  * actual cpu usage.  Thus if you manage to get in and get out
239  * quickly, you won't be penalized during your burst.  Once you
240  * start using your share of the cpu for more than about D
241  * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
242  * below all the other quick jobs.  Interactive tasks, because
243  * they basically always use less than their fair share of cpu,
244  * will be rewarded.
245  *
246  * If the process has not been running, then we want to
247  * apply the filter
248  *
249  *	cpu = cpu * (D-1)/D
250  *
251  * n times, yielding
252  *
253  *	cpu = cpu * ((D-1)/D)^n
254  *
255  * but D is big enough that this is approximately
256  *
257  * 	cpu = cpu * (D-n)/D
258  *
259  * so we use that instead.
260  *
261  * If the process has been running, we apply the filter to
262  * 1 - cpu, yielding a similar equation.  Note that cpu is
263  * stored in fixed point (* 1000).
264  *
265  * Updatecpu must be called before changing up, in order
266  * to maintain accurate cpu usage statistics.  It can be called
267  * at any time to bring the stats for a given proc up-to-date.
268  */
269 void
270 updatecpu(Proc *p)
271 {
272 	int n, t, ocpu;
273 	int D = schedgain*HZ*Scaling;
274 
275 	if(p->edf)
276 		return;
277 
278 	t = MACHP(0)->ticks*Scaling + Scaling/2;
279 	n = t - p->lastupdate;
280 	p->lastupdate = t;
281 
282 	if(n == 0)
283 		return;
284 	if(n > D)
285 		n = D;
286 
287 	ocpu = p->cpu;
288 	if(p != up)
289 		p->cpu = (ocpu*(D-n))/D;
290 	else{
291 		t = 1000 - ocpu;
292 		t = (t*(D-n))/D;
293 		p->cpu = 1000 - t;
294 	}
295 
296 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
297 }
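
/*
 * A worked example of the filter above (illustrative; HZ is
 * architecture-dependent, 100 is assumed here):
 *
 *	D = schedgain*HZ*Scaling = 30*100*2 = 6000
 *
 * A process that has been off the cpu for n = 600 scaled ticks
 * (3 seconds of real time) decays as
 *
 *	cpu = cpu*(6000-600)/6000 = cpu*9/10
 *
 * while one that ran for the same span climbs toward 1000 (1.0):
 *
 *	cpu = 1000 - (1000-cpu)*9/10
 */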
298 
299 /*
300  * On average, p has used p->cpu of a cpu recently.
301  * Its fair share is conf.nmach/MACHP(0)->load of a cpu.  If it has been
302  * getting too much, penalize it.  If it has not been getting enough, reward it.
303  * I don't think you can get much more than your fair share that
304  * often, so most of the queues are for using less.  Having a priority
305  * of 3 means you're just right.  Having a higher priority (up to p->basepri)
306  * means you're not using as much as you could.
307  */
308 int
309 reprioritize(Proc *p)
310 {
311 	int fairshare, n, load, ratio;
312 
313 	load = MACHP(0)->load;
314 	if(load == 0)
315 		return p->basepri;
316 
317 	/*
318 	 *  fairshare = 1.000 * conf.nmach * 1.000/load,
319 	 * except the decimal point is moved three places
320 	 * on both load and fairshare.
321 	 */
322 	fairshare = (conf.nmach*1000*1000)/load;
323 	n = p->cpu;
324 	if(n == 0)
325 		n = 1;
326 	ratio = (fairshare+n/2) / n;
327 	if(ratio > p->basepri)
328 		ratio = p->basepri;
329 	if(ratio < 0)
330 		panic("reprioritize");
331 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
332 	return ratio;
333 }
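
/*
 * A worked example (illustrative): on a uniprocessor
 * (conf.nmach == 1) with two cpu-bound processes, MACHP(0)->load
 * settles near 2000, so
 *
 *	fairshare = (1*1000*1000)/2000 = 500
 *
 * A process with p->cpu == 250, using half its share, gets
 *
 *	ratio = (500 + 125)/250 = 2
 *
 * in integer arithmetic, while a mostly idle one with p->cpu == 50
 * gets (500 + 25)/50 = 10, capped at p->basepri, and rises to the top.
 */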
334 
335 /*
336  * add a process to a scheduling queue
337  */
338 void
339 queueproc(Schedq *rq, Proc *p)
340 {
341 	int pri;
342 
343 	pri = rq - runq;
344 	lock(runq);
345 	p->priority = pri;
346 	p->rnext = 0;
347 	if(rq->tail)
348 		rq->tail->rnext = p;
349 	else
350 		rq->head = p;
351 	rq->tail = p;
352 	rq->n++;
353 	nrdy++;
354 	runvec |= 1<<pri;
355 	unlock(runq);
356 }
357 
358 /*
359  *  try to remove a process from a scheduling queue (called splhi)
360  */
361 Proc*
362 dequeueproc(Schedq *rq, Proc *tp)
363 {
364 	Proc *l, *p;
365 
366 	if(!canlock(runq))
367 		return nil;
368 
369 	/*
370 	 *  the queue may have changed before we locked runq,
371 	 *  refind the target process.
372 	 */
373 	l = 0;
374 	for(p = rq->head; p; p = p->rnext){
375 		if(p == tp)
376 			break;
377 		l = p;
378 	}
379 
380 	/*
381 	 *  p->mach==0 only when process state is saved
382 	 */
383 	if(p == 0 || p->mach){
384 		unlock(runq);
385 		return nil;
386 	}
387 	if(p->rnext == 0)
388 		rq->tail = l;
389 	if(l)
390 		l->rnext = p->rnext;
391 	else
392 		rq->head = p->rnext;
393 	if(rq->head == nil)
394 		runvec &= ~(1<<(rq-runq));
395 	rq->n--;
396 	nrdy--;
397 	if(p->state != Ready)
398 		print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
399 
400 	unlock(runq);
401 	return p;
402 }
403 
404 /*
405  *  ready(p) picks a new priority for a process and sticks it in the
406  *  runq for that priority.
407  */
408 void
409 ready(Proc *p)
410 {
411 	int s, pri;
412 	Schedq *rq;
413 	void (*pt)(Proc*, int, vlong);
414 
415 	s = splhi();
416 	if(edfready(p)){
417 		splx(s);
418 		return;
419 	}
420 
421 	if(up != p)
422 		m->readied = p;	/* group scheduling */
423 
424 	updatecpu(p);
425 	pri = reprioritize(p);
426 	p->priority = pri;
427 	rq = &runq[pri];
428 	p->state = Ready;
429 	queueproc(rq, p);
430 	pt = proctrace;
431 	if(pt)
432 		pt(p, SReady, 0);
433 	splx(s);
434 }
435 
436 /*
437  *  yield the processor and drop our priority
438  */
439 void
440 yield(void)
441 {
442 	if(anyready()){
443 		/* pretend we just used 1/2 tick */
444 		up->lastupdate -= Scaling/2;
445 		sched();
446 	}
447 }
448 
449 /*
450  *  recalculate priorities once a second.  We need to do this
451  *  since priorities will otherwise only be recalculated when
452  *  the running process blocks.
453  */
454 ulong balancetime;
455 
456 static void
457 rebalance(void)
458 {
459 	int pri, npri, t, x;
460 	Schedq *rq;
461 	Proc *p;
462 
463 	t = m->ticks;
464 	if(t - balancetime < HZ)
465 		return;
466 	balancetime = t;
467 
468 	for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
469 another:
470 		p = rq->head;
471 		if(p == nil)
472 			continue;
473 		if(p->mp != MACHP(m->machno))
474 			continue;
475 		if(pri == p->basepri)
476 			continue;
477 		updatecpu(p);
478 		npri = reprioritize(p);
479 		if(npri != pri){
480 			x = splhi();
481 			p = dequeueproc(rq, p);
482 			if(p)
483 				queueproc(&runq[npri], p);
484 			splx(x);
485 			goto another;
486 		}
487 	}
488 }
489 
490 
491 /*
492  *  pick a process to run
493  */
494 Proc*
495 runproc(void)
496 {
497 	Schedq *rq;
498 	Proc *p;
499 	ulong start, now;
500 	int i;
501 	void (*pt)(Proc*, int, vlong);
502 
503 	start = perfticks();
504 
505 	/* cooperative scheduling until the clock ticks */
506 	if((p=m->readied) && p->mach==0 && p->state==Ready
507 	&& runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
508 		skipscheds++;
509 		rq = &runq[p->priority];
510 		goto found;
511 	}
512 
513 	preempts++;
514 
515 loop:
516 	/*
517 	 *  find a process that last ran on this processor (affinity),
518 	 *  or one that hasn't moved in a while (load balancing).  Every
519 	 *  time around the loop affinity goes down.
520 	 */
521 	spllo();
522 	for(i = 0;; i++){
523 		/*
524 		 *  find the highest priority target process that this
525 		 *  processor can run given affinity constraints.
526 		 *
527 		 */
528 		for(rq = &runq[Nrq-1]; rq >= runq; rq--){
529 			for(p = rq->head; p; p = p->rnext){
530 				if(p->mp == nil || p->mp == MACHP(m->machno)
531 				|| (!p->wired && i > 0))
532 					goto found;
533 			}
534 		}
535 
536 		/* waste time or halt the CPU */
537 		idlehands();
538 
539 		/* remember how much time we spent idle */
540 		now = perfticks();
541 		m->perf.inidle += now-start;
542 		start = now;
543 	}
544 
545 found:
546 	splhi();
547 	p = dequeueproc(rq, p);
548 	if(p == nil)
549 		goto loop;
550 
551 	p->state = Scheding;
552 	p->mp = MACHP(m->machno);
553 
554 	if(edflock(p)){
555 		edfrun(p, rq == &runq[PriEdf]);	/* start deadline timer and do admin */
556 		edfunlock();
557 	}
558 	pt = proctrace;
559 	if(pt)
560 		pt(p, SRun, 0);
561 	return p;
562 }
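
/*
 * The affinity rule in the loop above, by example (illustrative):
 * on pass i == 0 a processor takes only processes whose p->mp is
 * nil or records this Mach; from i == 1 onward it will also steal
 * any unwired process, so a ready process stranded behind a busy
 * processor eventually migrates instead of starving.
 */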
563 
564 int
565 canpage(Proc *p)
566 {
567 	int ok = 0;
568 
569 	splhi();
570 	lock(runq);
571 	/* Only reliable way to see if we are Running */
572 	if(p->mach == 0) {
573 		p->newtlb = 1;
574 		ok = 1;
575 	}
576 	unlock(runq);
577 	spllo();
578 
579 	return ok;
580 }
581 
582 Proc*
583 newproc(void)
584 {
585 	Proc *p;
586 
587 	lock(&procalloc);
588 	for(;;) {
589 		if(p = procalloc.free)
590 			break;
591 
592 		unlock(&procalloc);
593 		resrcwait("no procs");
594 		lock(&procalloc);
595 	}
596 	procalloc.free = p->qnext;
597 	unlock(&procalloc);
598 
599 	p->state = Scheding;
600 	p->psstate = "New";
601 	p->mach = 0;
602 	p->qnext = 0;
603 	p->nchild = 0;
604 	p->nwait = 0;
605 	p->waitq = 0;
606 	p->parent = 0;
607 	p->pgrp = 0;
608 	p->egrp = 0;
609 	p->fgrp = 0;
610 	p->rgrp = 0;
611 	p->pdbg = 0;
612 	p->fpstate = FPinit;
613 	p->kp = 0;
614 	p->procctl = 0;
615 	p->notepending = 0;
616 	p->ureg = 0;
617 	p->privatemem = 0;
618 	p->noswap = 0;
619 	p->errstr = p->errbuf0;
620 	p->syserrstr = p->errbuf1;
621 	p->errbuf0[0] = '\0';
622 	p->errbuf1[0] = '\0';
623 	p->nlocks.ref = 0;
624 	p->delaysched = 0;
625 	p->trace = 0;
626 	kstrdup(&p->user, "*nouser");
627 	kstrdup(&p->text, "*notext");
628 	kstrdup(&p->args, "");
629 	p->nargs = 0;
630 	p->setargs = 0;
631 	memset(p->seg, 0, sizeof p->seg);
632 	p->pid = incref(&pidalloc);
633 	pidhash(p);
634 	p->noteid = incref(&noteidalloc);
635 	if(p->pid==0 || p->noteid==0)
636 		panic("pidalloc");
637 	if(p->kstack == 0)
638 		p->kstack = smalloc(KSTACK);
639 
640 	/* sched params */
641 	p->mp = 0;
642 	p->wired = 0;
643 	procpriority(p, PriNormal, 0);
644 	p->cpu = 0;
645 	p->lastupdate = MACHP(0)->ticks*Scaling;
646 	p->edf = nil;
647 
648 	return p;
649 }
650 
651 /*
652  * wire this proc to a machine
653  */
654 void
655 procwired(Proc *p, int bm)
656 {
657 	Proc *pp;
658 	int i;
659 	char nwired[MAXMACH];
660 	Mach *wm;
661 
662 	if(bm < 0){
663 		/* pick a machine to wire to */
664 		memset(nwired, 0, sizeof(nwired));
665 		p->wired = 0;
666 		pp = proctab(0);
667 		for(i=0; i<conf.nproc; i++, pp++){
668 			wm = pp->wired;
669 			if(wm && pp->pid)
670 				nwired[wm->machno]++;
671 		}
672 		bm = 0;
673 		for(i=0; i<conf.nmach; i++)
674 			if(nwired[i] < nwired[bm])
675 				bm = i;
676 	} else {
677 		/* use the processor requested, modulo the number present */
678 		bm = bm % conf.nmach;
679 	}
680 
681 	p->wired = MACHP(bm);
682 	p->mp = p->wired;
683 }
684 
685 void
686 procpriority(Proc *p, int pri, int fixed)
687 {
688 	if(pri >= Npriq)
689 		pri = Npriq - 1;
690 	else if(pri < 0)
691 		pri = 0;
692 	p->basepri = pri;
693 	p->priority = pri;
694 	if(fixed){
695 		p->fixedpri = 1;
696 	} else {
697 		p->fixedpri = 0;
698 	}
699 }
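
/*
 * Typical calls elsewhere in this file: newproc() uses
 * procpriority(p, PriNormal, 0) and kproc() uses
 * procpriority(p, PriKproc, 0).  A non-zero 'fixed' argument sets
 * p->fixedpri, which keeps hzsched() from forcing a reschedule
 * when the 100ms quantum expires.
 */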
700 
701 void
702 procinit0(void)		/* bad planning - clashes with devproc.c */
703 {
704 	Proc *p;
705 	int i;
706 
707 	procalloc.free = xalloc(conf.nproc*sizeof(Proc));
708 	if(procalloc.free == nil){
709 		xsummary();
710 		panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
711 	}
712 	procalloc.arena = procalloc.free;
713 
714 	p = procalloc.free;
715 	for(i=0; i<conf.nproc-1; i++,p++)
716 		p->qnext = p+1;
717 	p->qnext = 0;
718 }
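
/*
 * After procinit0 the arena is one free list threaded through
 * qnext (sketch):
 *
 *	procalloc.free -> arena[0] -> arena[1] -> ... -> arena[conf.nproc-1] -> nil
 *
 * newproc() pops from the head; schedinit() pushes dead
 * processes back on.
 */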
719 
720 /*
721  *  sleep if a condition is not true.  Another process will
722  *  awaken us after it sets the condition.  When we awaken
723  *  the condition may no longer be true.
724  *
725  *  we lock both the process and the rendezvous to keep r->p
726  *  and p->r synchronized.
727  */
728 void
729 sleep(Rendez *r, int (*f)(void*), void *arg)
730 {
731 	int s;
732 	void (*pt)(Proc*, int, vlong);
733 
734 	s = splhi();
735 
736 	if(up->nlocks.ref)
737 		print("process %lud sleeps with %lud locks held, last lock 0x%p locked at pc 0x%lux, sleep called from 0x%lux\n",
738 			up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
739 	lock(r);
740 	lock(&up->rlock);
741 	if(r->p){
742 		print("double sleep called from 0x%lux, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
743 		dumpstack();
744 	}
745 
746 	/*
747 	 *  Wakeup only knows there may be something to do by testing
748 	 *  r->p in order to get something to lock on.
749 	 *  Flush that information out to memory in case the sleep is
750 	 *  committed.
751 	 */
752 	r->p = up;
753 
754 	if((*f)(arg) || up->notepending){
755 		/*
756 		 *  if condition happened or a note is pending
757 		 *  never mind
758 		 */
759 		r->p = nil;
760 		unlock(&up->rlock);
761 		unlock(r);
762 	} else {
763 		/*
764 		 *  now we are committed to
765 		 *  change state and call scheduler
766 		 */
767 		pt = proctrace;
768 		if(pt)
769 			pt(up, SSleep, 0);
770 		up->state = Wakeme;
771 		up->r = r;
772 
773 		/* statistics */
774 		m->cs++;
775 
776 		procsave(up);
777 		if(setlabel(&up->sched)) {
778 			/*
779 			 *  here when the process is awakened
780 			 */
781 			procrestore(up);
782 			spllo();
783 		} else {
784 			/*
785 			 *  here to go to sleep (i.e. stop Running)
786 			 */
787 			unlock(&up->rlock);
788 			unlock(r);
789 			gotolabel(&m->sched);
790 		}
791 	}
792 
793 	if(up->notepending) {
794 		up->notepending = 0;
795 		splx(s);
796 		if(up->procctl == Proc_exitme && up->closingfgrp)
797 			forceclosefgrp();
798 		error(Eintr);
799 	}
800 
801 	splx(s);
802 }
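
/*
 * The standard sleep/wakeup idiom, sketched with a hypothetical
 * driver structure (Ctlr and its fields are not part of this file).
 * The condition function must only test state that the waker sets
 * before calling wakeup():
 *
 *	static int
 *	done(void *a)
 *	{
 *		return ((Ctlr*)a)->done;
 *	}
 *
 * The requesting process calls
 *
 *	sleep(&ctlr->r, done, ctlr);
 *
 * and the interrupt handler sets ctlr->done = 1, then calls
 * wakeup(&ctlr->r).  Because sleep() rechecks the condition under
 * the Rendez lock, the wakeup cannot be lost in between.
 */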
803 
804 static int
805 tfn(void *arg)
806 {
807 	return up->trend == nil || up->tfn(arg);
808 }
809 
810 void
811 twakeup(Ureg*, Timer *t)
812 {
813 	Proc *p;
814 	Rendez *trend;
815 
816 	p = t->ta;
817 	trend = p->trend;
818 	p->trend = 0;
819 	if(trend)
820 		wakeup(trend);
821 }
822 
823 void
824 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
825 {
826 	if (up->tt){
827 		print("tsleep: timer active: mode %d, tf 0x%lux\n", up->tmode, up->tf);
828 		timerdel(up);
829 	}
830 	up->tns = MS2NS(ms);
831 	up->tf = twakeup;
832 	up->tmode = Trelative;
833 	up->ta = up;
834 	up->trend = r;
835 	up->tfn = fn;
836 	timeradd(up);
837 
838 	if(waserror()){
839 		timerdel(up);
840 		nexterror();
841 	}
842 	sleep(r, tfn, arg);
843 	if (up->tt)
844 		timerdel(up);
845 	up->twhen = 0;
846 	poperror();
847 }
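
/*
 * A common tsleep() pattern (sketch): pausing for a bounded time
 * with a condition that never becomes true, using the kernel's
 * return0() helper:
 *
 *	tsleep(&up->sleep, return0, 0, 100);
 *
 * which simply sleeps for 100ms unless a note interrupts it;
 * up->sleep is the Rendez conventionally used for this.
 */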
848 
849 /*
850  *  Expects that only one process can call wakeup for any given Rendez.
851  *  We hold both locks to ensure that r->p and p->r remain consistent.
852  *  Richard Miller has a better solution that doesn't require both to
853  *  be held simultaneously, but I'm paranoid - presotto.
854  */
855 Proc*
856 wakeup(Rendez *r)
857 {
858 	Proc *p;
859 	int s;
860 
861 	s = splhi();
862 
863 	lock(r);
864 	p = r->p;
865 
866 	if(p != nil){
867 		lock(&p->rlock);
868 		if(p->state != Wakeme || p->r != r){
869 			iprint("%p %p %d\n", p->r, r, p->state);
870 			panic("wakeup: state");
871 		}
872 		r->p = nil;
873 		p->r = nil;
874 		ready(p);
875 		unlock(&p->rlock);
876 	}
877 	unlock(r);
878 
879 	splx(s);
880 
881 	return p;
882 }
883 
884 /*
885  *  if waking a sleeping process, this routine must hold both
886  *  p->rlock and r->lock.  However, it can't acquire them in
887  *  the same order as wakeup does, which risks a lock ordering
888  *  deadlock.  We break the deadlock by giving up p->rlock
889  *  whenever we can't get r->lock, and retrying.
890  */
891 int
892 postnote(Proc *p, int dolock, char *n, int flag)
893 {
894 	int s, ret;
895 	Rendez *r;
896 	Proc *d, **l;
897 
898 	if(dolock)
899 		qlock(&p->debug);
900 
901 	if(flag != NUser && (p->notify == 0 || p->notified))
902 		p->nnote = 0;
903 
904 	ret = 0;
905 	if(p->nnote < NNOTE) {
906 		strcpy(p->note[p->nnote].msg, n);
907 		p->note[p->nnote++].flag = flag;
908 		ret = 1;
909 	}
910 	p->notepending = 1;
911 	if(dolock)
912 		qunlock(&p->debug);
913 
914 	/* this loop is to avoid lock ordering problems. */
915 	for(;;){
916 		s = splhi();
917 		lock(&p->rlock);
918 		r = p->r;
919 
920 		/* waiting for a wakeup? */
921 		if(r == nil)
922 			break;	/* no */
923 
924 		/* try for the second lock */
925 		if(canlock(r)){
926 			if(p->state != Wakeme || r->p != p)
927 				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
928 			p->r = nil;
929 			r->p = nil;
930 			ready(p);
931 			unlock(r);
932 			break;
933 		}
934 
935 		/* give other process time to get out of critical section and try again */
936 		unlock(&p->rlock);
937 		splx(s);
938 		sched();
939 	}
940 	unlock(&p->rlock);
941 	splx(s);
942 
943 	if(p->state != Rendezvous)
944 		return ret;
945 
946 	/* Try and pull out of a rendezvous */
947 	lock(p->rgrp);
948 	if(p->state == Rendezvous) {
949 		p->rendval = ~0;
950 		l = &REND(p->rgrp, p->rendtag);
951 		for(d = *l; d; d = d->rendhash) {
952 			if(d == p) {
953 				*l = p->rendhash;
954 				break;
955 			}
956 			l = &d->rendhash;
957 		}
958 		ready(p);
959 	}
960 	unlock(p->rgrp);
961 	return ret;
962 }
963 
964 /*
965  * weird thing: keep at most NBROKEN around
966  */
967 #define	NBROKEN 4
968 struct
969 {
970 	QLock;
971 	int	n;
972 	Proc	*p[NBROKEN];
973 }broken;
974 
975 void
976 addbroken(Proc *p)
977 {
978 	qlock(&broken);
979 	if(broken.n == NBROKEN) {
980 		ready(broken.p[0]);
981 		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
982 		--broken.n;
983 	}
984 	broken.p[broken.n++] = p;
985 	qunlock(&broken);
986 
987 	edfstop(up);
988 	p->state = Broken;
989 	p->psstate = 0;
990 	sched();
991 }
992 
993 void
994 unbreak(Proc *p)
995 {
996 	int b;
997 
998 	qlock(&broken);
999 	for(b=0; b < broken.n; b++)
1000 		if(broken.p[b] == p) {
1001 			broken.n--;
1002 			memmove(&broken.p[b], &broken.p[b+1],
1003 					sizeof(Proc*)*(NBROKEN-(b+1)));
1004 			ready(p);
1005 			break;
1006 		}
1007 	qunlock(&broken);
1008 }
1009 
1010 int
1011 freebroken(void)
1012 {
1013 	int i, n;
1014 
1015 	qlock(&broken);
1016 	n = broken.n;
1017 	for(i=0; i<n; i++) {
1018 		ready(broken.p[i]);
1019 		broken.p[i] = 0;
1020 	}
1021 	broken.n = 0;
1022 	qunlock(&broken);
1023 	return n;
1024 }
1025 
1026 void
1027 pexit(char *exitstr, int freemem)
1028 {
1029 	Proc *p;
1030 	Segment **s, **es;
1031 	long utime, stime;
1032 	Waitq *wq, *f, *next;
1033 	Fgrp *fgrp;
1034 	Egrp *egrp;
1035 	Rgrp *rgrp;
1036 	Pgrp *pgrp;
1037 	Chan *dot;
1038 	void (*pt)(Proc*, int, vlong);
1039 
1040 	up->alarm = 0;
1041 	if (up->tt)
1042 		timerdel(up);
1043 	pt = proctrace;
1044 	if(pt)
1045 		pt(up, SDead, 0);
1046 
1047 	/* nil out all the resources under lock (free later) */
1048 	qlock(&up->debug);
1049 	fgrp = up->fgrp;
1050 	up->fgrp = nil;
1051 	egrp = up->egrp;
1052 	up->egrp = nil;
1053 	rgrp = up->rgrp;
1054 	up->rgrp = nil;
1055 	pgrp = up->pgrp;
1056 	up->pgrp = nil;
1057 	dot = up->dot;
1058 	up->dot = nil;
1059 	qunlock(&up->debug);
1060 
1061 	if(fgrp)
1062 		closefgrp(fgrp);
1063 	if(egrp)
1064 		closeegrp(egrp);
1065 	if(rgrp)
1066 		closergrp(rgrp);
1067 	if(dot)
1068 		cclose(dot);
1069 	if(pgrp)
1070 		closepgrp(pgrp);
1071 
1072 	/*
1073 	 * if not a kernel process and we have a parent,
1074 	 * do some housekeeping.
1075 	 */
1076 	if(up->kp == 0) {
1077 		p = up->parent;
1078 		if(p == 0) {
1079 			if(exitstr == 0)
1080 				exitstr = "unknown";
1081 			panic("boot process died: %s", exitstr);
1082 		}
1083 
1084 		while(waserror())
1085 			;
1086 
1087 		wq = smalloc(sizeof(Waitq));
1088 		poperror();
1089 
1090 		wq->w.pid = up->pid;
1091 		utime = up->time[TUser] + up->time[TCUser];
1092 		stime = up->time[TSys] + up->time[TCSys];
1093 		wq->w.time[TUser] = tk2ms(utime);
1094 		wq->w.time[TSys] = tk2ms(stime);
1095 		wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1096 		if(exitstr && exitstr[0])
1097 			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1098 		else
1099 			wq->w.msg[0] = '\0';
1100 
1101 		lock(&p->exl);
1102 		/*
1103 		 * Check that parent is still alive.
1104 		 */
1105 		if(p->pid == up->parentpid && p->state != Broken) {
1106 			p->nchild--;
1107 			p->time[TCUser] += utime;
1108 			p->time[TCSys] += stime;
1109 			/*
1110 			 * If there would be more than 128 wait records
1111 			 * for my parent, then don't leave a wait
1112 			 * record behind.  This helps prevent badly written
1113 			 * daemon processes from accumulating lots of wait
1114 			 * records.
1115 		 	 */
1116 			if(p->nwait < 128) {
1117 				wq->next = p->waitq;
1118 				p->waitq = wq;
1119 				p->nwait++;
1120 				wq = nil;
1121 				wakeup(&p->waitr);
1122 			}
1123 		}
1124 		unlock(&p->exl);
1125 		if(wq)
1126 			free(wq);
1127 	}
1128 
1129 	if(!freemem)
1130 		addbroken(up);
1131 
1132 	qlock(&up->seglock);
1133 	es = &up->seg[NSEG];
1134 	for(s = up->seg; s < es; s++) {
1135 		if(*s) {
1136 			putseg(*s);
1137 			*s = 0;
1138 		}
1139 	}
1140 	qunlock(&up->seglock);
1141 
1142 	lock(&up->exl);		/* Prevent my children from leaving waits */
1143 	pidunhash(up);
1144 	up->pid = 0;
1145 	wakeup(&up->waitr);
1146 	unlock(&up->exl);
1147 
1148 	for(f = up->waitq; f; f = next) {
1149 		next = f->next;
1150 		free(f);
1151 	}
1152 
1153 	/* release debuggers */
1154 	qlock(&up->debug);
1155 	if(up->pdbg) {
1156 		wakeup(&up->pdbg->sleep);
1157 		up->pdbg = 0;
1158 	}
1159 	qunlock(&up->debug);
1160 
1161 	/* Sched must not loop for these locks */
1162 	lock(&procalloc);
1163 	lock(&palloc);
1164 
1165 	edfstop(up);
1166 	up->state = Moribund;
1167 	sched();
1168 	panic("pexit");
1169 }
1170 
1171 int
1172 haswaitq(void *x)
1173 {
1174 	Proc *p;
1175 
1176 	p = (Proc *)x;
1177 	return p->waitq != 0;
1178 }
1179 
1180 ulong
1181 pwait(Waitmsg *w)
1182 {
1183 	ulong cpid;
1184 	Waitq *wq;
1185 
1186 	if(!canqlock(&up->qwaitr))
1187 		error(Einuse);
1188 
1189 	if(waserror()) {
1190 		qunlock(&up->qwaitr);
1191 		nexterror();
1192 	}
1193 
1194 	lock(&up->exl);
1195 	if(up->nchild == 0 && up->waitq == 0) {
1196 		unlock(&up->exl);
1197 		error(Enochild);
1198 	}
1199 	unlock(&up->exl);
1200 
1201 	sleep(&up->waitr, haswaitq, up);
1202 
1203 	lock(&up->exl);
1204 	wq = up->waitq;
1205 	up->waitq = wq->next;
1206 	up->nwait--;
1207 	unlock(&up->exl);
1208 
1209 	qunlock(&up->qwaitr);
1210 	poperror();
1211 
1212 	if(w)
1213 		memmove(w, &wq->w, sizeof(Waitmsg));
1214 	cpid = wq->w.pid;
1215 	free(wq);
1216 	return cpid;
1217 }
1218 
1219 Proc*
1220 proctab(int i)
1221 {
1222 	return &procalloc.arena[i];
1223 }
1224 
1225 void
1226 dumpaproc(Proc *p)
1227 {
1228 	ulong bss;
1229 	char *s;
1230 
1231 	if(p == 0)
1232 		return;
1233 
1234 	bss = 0;
1235 	if(p->seg[BSEG])
1236 		bss = p->seg[BSEG]->top;
1237 
1238 	s = p->psstate;
1239 	if(s == 0)
1240 		s = statename[p->state];
1241 	print("%3lud:%10s pc %8lux dbgpc %8lux  %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
1242 		p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
1243 		p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
1244 }
1245 
1246 void
1247 procdump(void)
1248 {
1249 	int i;
1250 	Proc *p;
1251 
1252 	if(up)
1253 		print("up %lud\n", up->pid);
1254 	else
1255 		print("no current process\n");
1256 	for(i=0; i<conf.nproc; i++) {
1257 		p = &procalloc.arena[i];
1258 		if(p->state == Dead)
1259 			continue;
1260 
1261 		dumpaproc(p);
1262 	}
1263 }
1264 
1265 /*
1266  *  wait till all processes have flushed their mmu
1267  *  state about segment s
1268  */
1269 void
1270 procflushseg(Segment *s)
1271 {
1272 	int i, ns, nm, nwait;
1273 	Proc *p;
1274 
1275 	/*
1276 	 *  tell all processes with this
1277 	 *  segment to flush their mmu's
1278 	 */
1279 	nwait = 0;
1280 	for(i=0; i<conf.nproc; i++) {
1281 		p = &procalloc.arena[i];
1282 		if(p->state == Dead)
1283 			continue;
1284 		for(ns = 0; ns < NSEG; ns++)
1285 			if(p->seg[ns] == s){
1286 				p->newtlb = 1;
1287 				for(nm = 0; nm < conf.nmach; nm++){
1288 					if(MACHP(nm)->proc == p){
1289 						MACHP(nm)->flushmmu = 1;
1290 						nwait++;
1291 					}
1292 				}
1293 				break;
1294 			}
1295 	}
1296 
1297 	if(nwait == 0)
1298 		return;
1299 
1300 	/*
1301 	 *  wait for all processors to take a clock interrupt
1302 	 *  and flush their mmu's
1303 	 */
1304 	for(nm = 0; nm < conf.nmach; nm++)
1305 		if(MACHP(nm) != m)
1306 			while(MACHP(nm)->flushmmu)
1307 				sched();
1308 }
1309 
1310 void
1311 scheddump(void)
1312 {
1313 	Proc *p;
1314 	Schedq *rq;
1315 
1316 	for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1317 		if(rq->head == 0)
1318 			continue;
1319 		print("rq%ld:", rq-runq);
1320 		for(p = rq->head; p; p = p->rnext)
1321 			print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1322 		print("\n");
1323 		delay(150);
1324 	}
1325 	print("nrdy %d\n", nrdy);
1326 }
1327 
1328 void
1329 kproc(char *name, void (*func)(void *), void *arg)
1330 {
1331 	Proc *p;
1332 	static Pgrp *kpgrp;
1333 
1334 	p = newproc();
1335 	p->psstate = 0;
1336 	p->procmode = 0640;
1337 	p->kp = 1;
1338 	p->noswap = 1;
1339 
1340 	p->fpsave = up->fpsave;
1341 	p->scallnr = up->scallnr;
1342 	p->s = up->s;
1343 	p->nerrlab = 0;
1344 	p->slash = up->slash;
1345 	p->dot = up->dot;
1346 	if(p->dot)
1347 		incref(p->dot);
1348 
1349 	memmove(p->note, up->note, sizeof(p->note));
1350 	p->nnote = up->nnote;
1351 	p->notified = 0;
1352 	p->lastnote = up->lastnote;
1353 	p->notify = up->notify;
1354 	p->ureg = 0;
1355 	p->dbgreg = 0;
1356 
1357 	procpriority(p, PriKproc, 0);
1358 
1359 	kprocchild(p, func, arg);
1360 
1361 	kstrdup(&p->user, eve);
1362 	kstrdup(&p->text, name);
1363 	if(kpgrp == 0)
1364 		kpgrp = newpgrp();
1365 	p->pgrp = kpgrp;
1366 	incref(kpgrp);
1367 
1368 	memset(p->time, 0, sizeof(p->time));
1369 	p->time[TReal] = MACHP(0)->ticks;
1370 	ready(p);
1371 	/*
1372 	 *  since the bss/data segments are now shareable,
1373 	 *  any mmu info about this process is now stale
1374 	 *  and has to be discarded.
1375 	 */
1376 	p->newtlb = 1;
1377 	flushmmu();
1378 }
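
/*
 * Usage sketch (poller and its argument are hypothetical): a
 * driver starts a helper with
 *
 *	kproc("mypoller", poller, ctlr);
 *
 * The name appears in ps listings; func is entered via
 * kprocchild() and never returns to its caller.  A kernel process
 * that wants to die calls pexit("", 1) itself.
 */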
1379 
1380 /*
1381  *  called splhi() by notify().  See comment in notify for the
1382  *  reasoning.
1383  */
1384 void
1385 procctl(Proc *p)
1386 {
1387 	char *state;
1388 	ulong s;
1389 
1390 	switch(p->procctl) {
1391 	case Proc_exitbig:
1392 		spllo();
1393 		pexit("Killed: Insufficient physical memory", 1);
1394 
1395 	case Proc_exitme:
1396 		spllo();		/* pexit has locks in it */
1397 		pexit("Killed", 1);
1398 
1399 	case Proc_traceme:
1400 		if(p->nnote == 0)
1401 			return;
1402 		/* No break */
1403 
1404 	case Proc_stopme:
1405 		p->procctl = 0;
1406 		state = p->psstate;
1407 		p->psstate = "Stopped";
1408 		/* free a waiting debugger */
1409 		s = spllo();
1410 		qlock(&p->debug);
1411 		if(p->pdbg) {
1412 			wakeup(&p->pdbg->sleep);
1413 			p->pdbg = 0;
1414 		}
1415 		qunlock(&p->debug);
1416 		splhi();
1417 		p->state = Stopped;
1418 		sched();
1419 		p->psstate = state;
1420 		splx(s);
1421 		return;
1422 	}
1423 }
1424 
1425 #include "errstr.h"
1426 
1427 void
1428 error(char *err)
1429 {
1430 	spllo();
1431 
1432 	assert(up->nerrlab < NERR);
1433 	kstrcpy(up->errstr, err, ERRMAX);
1434 	setlabel(&up->errlab[NERR-1]);
1435 	nexterror();
1436 }
1437 
1438 void
1439 nexterror(void)
1440 {
1441 	gotolabel(&up->errlab[--up->nerrlab]);
1442 }
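
/*
 * The error/nexterror discipline in outline: waserror() pushes a
 * label on up->errlab and returns 0; error() or nexterror() jumps
 * back to the newest label, where waserror() appears to return
 * non-zero; poperror() discards a label on the success path.
 * Sketch, with cleanup() and dowork() standing in for
 * caller-specific code:
 *
 *	if(waserror()){
 *		cleanup();
 *		nexterror();
 *	}
 *	dowork();
 *	poperror();
 */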
1443 
1444 void
1445 exhausted(char *resource)
1446 {
1447 	char buf[ERRMAX];
1448 
1449 	sprint(buf, "no free %s", resource);
1450 	iprint("%s\n", buf);
1451 	error(buf);
1452 }
1453 
1454 void
1455 killbig(char *why)
1456 {
1457 	int i;
1458 	Segment *s;
1459 	ulong l, max;
1460 	Proc *p, *ep, *kp;
1461 
1462 	max = 0;
1463 	kp = 0;
1464 	ep = procalloc.arena+conf.nproc;
1465 	for(p = procalloc.arena; p < ep; p++) {
1466 		if(p->state == Dead || p->kp)
1467 			continue;
1468 		l = 0;
1469 		for(i=1; i<NSEG; i++) {
1470 			s = p->seg[i];
1471 			if(s != 0)
1472 				l += s->top - s->base;
1473 		}
1474 		if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
1475 			kp = p;
1476 			max = l;
1477 		}
1478 	}
1479 
	if(kp == nil)		/* may find no candidate to kill */
		return;
1480 	print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1481 	for(p = procalloc.arena; p < ep; p++) {
1482 		if(p->state == Dead || p->kp)
1483 			continue;
1484 		if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
1485 			p->procctl = Proc_exitbig;
1486 	}
1487 	kp->procctl = Proc_exitbig;
1488 	for(i = 0; i < NSEG; i++) {
1489 		s = kp->seg[i];
1490 		if(s != 0 && canqlock(&s->lk)) {
1491 			mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1492 			qunlock(&s->lk);
1493 		}
1494 	}
1495 }
1496 
1497 /*
1498  *  change ownership to 'new' of all processes owned by 'old'.  Used when
1499  *  eve changes.
1500  */
1501 void
1502 renameuser(char *old, char *new)
1503 {
1504 	Proc *p, *ep;
1505 
1506 	ep = procalloc.arena+conf.nproc;
1507 	for(p = procalloc.arena; p < ep; p++)
1508 		if(p->user!=nil && strcmp(old, p->user)==0)
1509 			kstrdup(&p->user, new);
1510 }
1511 
1512 /*
1513  *  time accounting called by clock() splhi'd
1514  */
1515 void
1516 accounttime(void)
1517 {
1518 	Proc *p;
1519 	ulong n, per;
1520 	static ulong nrun;
1521 
1522 	p = m->proc;
1523 	if(p) {
1524 		nrun++;
1525 		p->time[p->insyscall]++;
1526 	}
1527 
1528 	/* calculate decaying duty cycles */
1529 	n = perfticks();
1530 	per = n - m->perf.last;
1531 	m->perf.last = n;
1532 	per = (m->perf.period*(HZ-1) + per)/HZ;
1533 	if(per != 0)
1534 		m->perf.period = per;
1535 
1536 	m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1537 	m->perf.inidle = 0;
1538 
1539 	m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1540 	m->perf.inintr = 0;
1541 
1542 	/* only one processor gets to compute system load averages */
1543 	if(m->machno != 0)
1544 		return;
1545 
1546 	/*
1547 	 * calculate decaying load average.
1548 	 * if we decay by (n-1)/n then it takes
1549 	 * n clock ticks to go from load L to .36 L once
1550 	 * things quiet down.  it takes about 5 n clock
1551 	 * ticks to go to zero.  so using HZ means this is
1552 	 * approximately the load over the last second,
1553 	 * with a tail lasting about 5 seconds.
1554 	 */
1555 	n = nrun;
1556 	nrun = 0;
1557 	n = (nrdy+n)*1000;
1558 	m->load = (m->load*(HZ-1)+n)/HZ;
1559 }
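
/*
 * A worked example of the load filter above (illustrative; HZ = 100
 * assumed): with two processes on the run queues and one running on
 * the sole processor, n = (2+1)*1000 = 3000 on each tick, and
 *
 *	load = (load*99 + 3000)/100
 *
 * converges on 3000, a load average of 3.0 in the fixed-point
 * (*1000) representation used throughout this file.
 */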
1560 
1561 static void
1562 pidhash(Proc *p)
1563 {
1564 	int h;
1565 
1566 	h = p->pid % nelem(procalloc.ht);
1567 	lock(&procalloc);
1568 	p->pidhash = procalloc.ht[h];
1569 	procalloc.ht[h] = p;
1570 	unlock(&procalloc);
1571 }
1572 
1573 static void
1574 pidunhash(Proc *p)
1575 {
1576 	int h;
1577 	Proc **l;
1578 
1579 	h = p->pid % nelem(procalloc.ht);
1580 	lock(&procalloc);
1581 	for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1582 		if(*l == p){
1583 			*l = p->pidhash;
1584 			break;
1585 		}
1586 	unlock(&procalloc);
1587 }
1588 
1589 int
1590 procindex(ulong pid)
1591 {
1592 	Proc *p;
1593 	int h;
1594 	int s;
1595 
1596 	s = -1;
1597 	h = pid % nelem(procalloc.ht);
1598 	lock(&procalloc);
1599 	for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1600 		if(p->pid == pid){
1601 			s = p - procalloc.arena;
1602 			break;
1603 		}
1604 	unlock(&procalloc);
1605 	return s;
1606 }
1607